66 files changed, 21028 insertions, 0 deletions
diff --git a/src/core/CL/cl_kernels/common/activation_layer.cl b/src/core/CL/cl_kernels/common/activation_layer.cl
new file mode 100644
index 0000000000..a04556a1ed
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/activation_layer.cl
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ACT) && defined(DATA_TYPE) && defined(VEC_SIZE)
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+#include "activation_float_helpers.h"
+
+/** This performs an activation function floating point inputs.
+ *
+ * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void activation_layer(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
+
+    // Get pixels pointer
+    __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
+#ifdef IN_PLACE
+    __global uchar *output_addr = input_addr;
+#else  /* IN_PLACE */
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
+#endif /* IN_PLACE */
+
+    // Load data
+    TYPE data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
+
+    // Perform activation
+    data0 = ACTIVATION(ACT, DATA_TYPE, VEC_SIZE, data0, A_VAL, B_VAL);
+
+    // Store result
+    STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+
+#endif /* defined(ACT) */
diff --git a/src/core/CL/cl_kernels/common/activation_layer_quant.cl b/src/core/CL/cl_kernels/common/activation_layer_quant.cl
new file mode 100644
index 0000000000..38ee00b17a
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/activation_layer_quant.cl
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_quant_helpers.h"
+
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+
+#if defined(FLOAT_DOMAIN)
+// Activations performed in the float domain
+
+#include "activation_float_helpers.h"
+
+/** This performs an activation function on quantized inputs with float transformations.
+ *
+ * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ * @note Quantization scales of the input/output tensors are passed in with -DS1_VAL= and -DS2_VAL= respectively.
+ * @note Quantization offsets of the input/output tensors are passed in only if asymmetric with -DO1_VAL= and -DO2_VAL= respectively.
+ * @note Quantized value of constant zero should be given as a preprocessor argument using -DCONST_0=value. e.g. -DCONST_0=128.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM16
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void activation_layer_quant_f32(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
+
+    // Get pixels pointer
+    __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
+#ifdef IN_PLACE
+    __global uchar *output_addr = input_addr;
+#else  /* IN_PLACE */
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
+#endif /* IN_PLACE */
+
+    // Load data
+    TYPE data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
+
+    VEC_FLOAT data_flt = CONVERT(data0, VEC_FLOAT);
+#if defined(O1_VAL)
+    data_flt = round(data_flt - (float)O1_VAL) * ((float)S1_VAL);
+#else  // defined(O1_VAL)
+    data_flt                    = round(data_flt) * ((float)S1_VAL);
+#endif // defined(O1_VAL)
+    data_flt = ACTIVATION(ACT, float, VEC_SIZE, data_flt, A_VAL, B_VAL);
+
+#if defined(O2_VAL)
+    data0 = CONVERT_SAT(round(data_flt / ((float)S2_VAL)) + (float)O2_VAL, TYPE);
+#else  // defined(O2_VAL)
+    data0                       = CONVERT_SAT(round(data_flt / ((float)S2_VAL)), TYPE);
+#endif // defined(O2_VAL)
+
+    // Store result
+    STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+
+#else // defined(FLOAT_DOMAIN)
+// Activations performed in the quantized domain
+
+#if defined(ACT)
+/** This performs an activation function on quantized inputs.
+ *
+ * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ * @note Quantization scales of the input/output tensors are passed in with -DS1_VAL= and -DS2_VAL= respectively.
+ * @note Quantization offsets of the input/output tensors are passed in with -DO1_VAL= and -DO2_VAL= respectively.
+ * @note Quantized value of constant zero should be given as a preprocessor argument using -DCONST_0=value. e.g. -DCONST_0=128.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM16
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void activation_layer_quant(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
+
+    // Get pixels pointer
+    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
+#ifdef IN_PLACE
+    __global uchar *output_addr = input_addr;
+#else  /* IN_PLACE */
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
+#endif /* IN_PLACE */
+
+    // Load data
+    TYPE data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
+
+    data0 = PERFORM_ACTIVATION_QUANT(ACT, data0);
+
+    // Store result
+    STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif // defined(ACT)
+#endif // defined(FLOAT_DOMAIN)
diff --git a/src/core/CL/cl_kernels/common/arg_min_max.cl b/src/core/CL/cl_kernels/common/arg_min_max.cl
new file mode 100644
index 0000000000..413fcf5333
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/arg_min_max.cl
@@ -0,0 +1,388 @@
+/*
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE_OUTPUT)
+
+#define VEC_TYPE_IN VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define VEC_TYPE_OUT VEC_DATA_TYPE(DATA_TYPE_OUTPUT, VEC_SIZE)
+#define VEC_SELECT_IN SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define VEC_SIGNED_INT_IN SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+#if defined(FLOAT_DATA_TYPE)
+#define ISGREATER(x, y) (VEC_SELECT_IN) isgreater(x, y)
+#define ISLESS(x, y) (VEC_SELECT_IN) isless(x, y)
+#else // !FLOAT_DATA_TYPE
+#if defined(WIDTH)
+#define ISGREATER(x, y) (x > y) ? 1 : 0
+#define ISLESS(x, y) (x < y) ? 1 : 0
+#else // !defined(WIDTH)
+#define ISGREATER(x, y) select((VEC_SIGNED_INT_IN)0, (VEC_SIGNED_INT_IN)-1, (VEC_SIGNED_INT_IN)(x > y))
+#define ISLESS(x, y) select((VEC_SIGNED_INT_IN)0, (VEC_SIGNED_INT_IN)-1, (VEC_SIGNED_INT_IN)(x < y))
+#endif // defined(WIDTH)
+#endif // defined(FLOAT_DATA_TYPE)
+
+#if defined(ARG_MAX)
+#define CONDITION_TO_USE(x, y) ISGREATER(x, y)
+#elif defined(ARG_MIN)
+#define CONDITION_TO_USE(x, y) ISLESS(x, y)
+#else // !(defined(ARG_MAX) || defined(ARG_MIN))
+#error "Unsupported reduction operation!"
+#endif // defined(ARG_MAX)
+
+#if defined(WIDTH)
+
+#if defined(ARG_MAX)
+#define VECTOR_PREDICATE_EQ(x, y) ((x) >= (y))
+#define VECTOR_PREDICATE(x, y) ((x) > (y))
+#define SCALAR_SELECT_OP(x, y) ((x) > (y)) ? (x) : (y);
+#elif defined(ARG_MIN)
+#define VECTOR_PREDICATE_EQ(x, y) ((x) <= (y))
+#define VECTOR_PREDICATE(x, y) ((x) < (y))
+#define SCALAR_SELECT_OP(x, y) ((x) < (y)) ? (x) : (y);
+#else // !(defined(ARG_MAX) || defined(ARG_MIN))
+#error "Unsupported reduction operation!"
+#endif // defined(ARG_MAX)
+
+inline DATA_TYPE_OUTPUT vectorized_compute_arg_min_max_2(DATA_TYPE *min_max_val, DATA_TYPE_OUTPUT *min_max_idx, VEC_DATA_TYPE(DATA_TYPE, 2) in, VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 2) res)
+{
+    if( VECTOR_PREDICATE_EQ(in.s0,in.s1) )
+    {
+        *min_max_val  = in.s0;
+        *min_max_idx  = res.s0;
+    }
+    else
+    {
+        *min_max_val  = in.s1;
+        *min_max_idx  = res.s1;
+    }
+}
+
+inline DATA_TYPE_OUTPUT vectorized_compute_arg_min_max_4(DATA_TYPE *min_max_val, DATA_TYPE_OUTPUT *min_max_idx, VEC_DATA_TYPE(DATA_TYPE, 4) in, VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 4) res)
+{
+    VEC_DATA_TYPE(COND_DATA_TYPE, 2)
+    idx_sel       = VECTOR_PREDICATE_EQ(in.s01, in.s23);
+    in.s01      = select(in.s23, in.s01, idx_sel);
+    res.s01     = select(res.s23, res.s01, CONVERT(idx_sel, VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 2) ));
+    idx_sel.s0    = VECTOR_PREDICATE(in.s0, in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), COND_DATA_TYPE));
+    res.s0        = select(res.s1, res.s0, CONVERT(idx_sel.s0, DATA_TYPE_OUTPUT));
+    *min_max_val  = SCALAR_SELECT_OP(in.s0, in.s1);
+    *min_max_idx  = res.s0;
+}
+
+inline DATA_TYPE_OUTPUT vectorized_compute_arg_min_max_8(DATA_TYPE *min_max_val, DATA_TYPE_OUTPUT *min_max_idx, VEC_DATA_TYPE(DATA_TYPE, 8) in, VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 8) res)
+{
+    VEC_DATA_TYPE(COND_DATA_TYPE, 4)
+    idx_sel       = VECTOR_PREDICATE_EQ(in.s0123, in.s4567);
+    in.s0123      = select(in.s4567, in.s0123, idx_sel);
+    res.s0123     = select(res.s4567, res.s0123, CONVERT(idx_sel,  VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 4) ));
+    idx_sel.s01   = (VECTOR_PREDICATE(in.s01, in.s23)) || (in.s01 == in.s23 && CONVERT(((res.s01 < res.s23)), VEC_DATA_TYPE(COND_DATA_TYPE, 2)));
+    in.s01        = select(in.s23, in.s01, idx_sel.s01);
+    res.s01       = select(res.s23, res.s01, CONVERT(idx_sel.s01,  VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 2) ));
+    idx_sel.s0    = VECTOR_PREDICATE(in.s0, in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), COND_DATA_TYPE));
+    res.s0        = select(res.s1, res.s0, CONVERT(idx_sel.s0, DATA_TYPE_OUTPUT));
+    *min_max_val  = SCALAR_SELECT_OP(in.s0, in.s1);
+    *min_max_idx  = res.s0;
+}
+
+inline DATA_TYPE_OUTPUT vectorized_compute_arg_min_max_16(DATA_TYPE *min_max_val, DATA_TYPE_OUTPUT *min_max_idx, VEC_DATA_TYPE(DATA_TYPE, 16) in, VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) res)
+{
+    VEC_DATA_TYPE(COND_DATA_TYPE, 8)
+    idx_sel       = VECTOR_PREDICATE_EQ(in.s01234567, in.s89abcdef);
+    in.s01234567  = select(in.s89abcdef, in.s01234567, idx_sel);
+    res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel,  VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 8) ));
+    idx_sel.s0123 = VECTOR_PREDICATE(in.s0123, in.s4567) || (in.s0123 == in.s4567 && CONVERT(((res.s0123 < res.s4567)), VEC_DATA_TYPE(COND_DATA_TYPE, 4)));
+    in.s0123      = select(in.s4567, in.s0123, idx_sel.s0123);
+    res.s0123     = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123,  VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 4) ));
+    idx_sel.s01   = (VECTOR_PREDICATE(in.s01, in.s23)) || (in.s01 == in.s23 && CONVERT(((res.s01 < res.s23)), VEC_DATA_TYPE(COND_DATA_TYPE, 2)));
+    in.s01        = select(in.s23, in.s01, idx_sel.s01);
+    res.s01       = select(res.s23, res.s01, CONVERT(idx_sel.s01,  VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 2) ));
+    idx_sel.s0    = VECTOR_PREDICATE(in.s0, in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), COND_DATA_TYPE));
+    res.s0        = select(res.s1, res.s0, CONVERT(idx_sel.s0, DATA_TYPE_OUTPUT));
+    *min_max_val  = SCALAR_SELECT_OP(in.s0, in.s1);
+    *min_max_idx  = res.s0;
+}
+
+
+
+inline void scalar_compute_global_min_max(DATA_TYPE in_val, int idx, DATA_TYPE *out_min_max_val, DATA_TYPE_OUTPUT *out_idx)
+{
+#if defined(ARG_MAX)
+    if(in_val > *out_min_max_val)
+#else  // defined(ARG_MAX)
+    if(in_val < *out_min_max_val)
+#endif // defined(ARG_MAX)
+    {
+        *out_min_max_val = in_val;
+        *out_idx         = idx;
+    }
+}
+
+#if VEC_SIZE > 1
+#if VEC_SIZE == 16
+    #define VECTORIZED_OP(min_max_val,min_max_idx,in,res) vectorized_compute_arg_min_max_16(min_max_val,min_max_idx,in,res)
+#elif VEC_SIZE == 8 // #if VEC_SIZE == 16
+    #define VECTORIZED_OP(min_max_val,min_max_idx,in,res) vectorized_compute_arg_min_max_8(min_max_val,min_max_idx,in,res)
+#elif VEC_SIZE == 4 // # elif VEC_SIZE == 8
+    #define VECTORIZED_OP(min_max_val,min_max_idx,in,res) vectorized_compute_arg_min_max_4(min_max_val,min_max_idx,in,res)
+#elif VEC_SIZE == 2 // elif VEC_SIZE == 4
+    #define VECTORIZED_OP(min_max_val,min_max_idx,in,res) vectorized_compute_arg_min_max_2(min_max_val,min_max_idx,in,res)
+#else // elif VEC_SIZE == 2
+    #error "Not supported"
+#endif // #if VEC_SIZE == 16
+
+inline VEC_DATA_TYPE(DATA_TYPE_OUTPUT, VEC_SIZE) init_idx_vector()
+{
+#if VEC_SIZE == 16
+    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, VEC_SIZE)
+    vidx = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+#elif VEC_SIZE == 8 // #if VEC_SIZE == 16
+    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, VEC_SIZE)
+    vidx = { 0, 1, 2, 3, 4, 5, 6, 7 };
+#elif VEC_SIZE == 4 // elif VEC_SIZE == 8
+    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, VEC_SIZE)
+    vidx = { 0, 1, 2, 3 };
+#elif VEC_SIZE == 2 // elif VEC_SIZE == 4
+    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, VEC_SIZE)
+    vidx = { 0, 1 };
+#else  // elif VEC_SIZE == 2
+#error "Not supported"
+#endif // #if VEC_SIZE == 16
+    return vidx;
+}
+#endif // VEC_SIZE > 1
+
+/** This kernel performs reduction on x-axis.
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g. -DDATA_TYPE_OUTPUT=uint
+ * @note The data type used for the comparing indexe must be passed at compile type using -DCOND_DATA_TYPE: e.g -DCOND_DATA_TYPE=uint
+ * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
+ *
+ * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
+ * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in] output_ptr                           The local buffer to hold sumed values. Supported data types: U32/S32
+ * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
+ */
+__kernel void arg_min_max_x(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(output))
+{
+    __global DATA_TYPE *input_addr         = (__global DATA_TYPE *)(input_ptr + input_offset_first_element_in_bytes + get_global_id(1) * input_stride_y);
+    __global DATA_TYPE_OUTPUT *output_addr = (__global DATA_TYPE_OUTPUT *)(output_ptr + output_offset_first_element_in_bytes + get_global_id(1) * output_stride_y);
+
+    DATA_TYPE        final_value = input_addr[0];
+    DATA_TYPE_OUTPUT final_idx   = 0;
+
+#if VEC_SIZE > 1
+    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, VEC_SIZE)
+    vidx = init_idx_vector();
+
+    int x = 0;
+    for(; x <= (WIDTH - VEC_SIZE); x += VEC_SIZE)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+        vals = VLOAD(VEC_SIZE)(0, (input_addr + x));
+        DATA_TYPE        local_min_max_value;
+        DATA_TYPE_OUTPUT local_min_max_idx;
+
+        VECTORIZED_OP(&local_min_max_value, &local_min_max_idx, vals, vidx);
+        local_min_max_idx += x;
+        scalar_compute_global_min_max(local_min_max_value, local_min_max_idx, &final_value, &final_idx);
+    }
+#endif // VEC_SIZE > 1
+
+#if(WIDTH % VEC_SIZE)
+    LOOP_UNROLLING(int, j, 0, 1, WIDTH % VEC_SIZE,
+    {
+        scalar_compute_global_min_max(*(input_addr + j + x), j + x, &final_value, &final_idx);
+    })
+#endif // (WIDTH % VEC_SIZE)
+
+    output_addr[0] = final_idx;
+}
+#endif // defined(WIDTH)
+
+#if defined(HEIGHT)
+/** This kernel performs reduction on y-axis.
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g. -DDATA_TYPE_OUTPUT=uint
+ * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
+ *
+ * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
+ * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in] output_ptr                           The local buffer to hold sumed values. Supported data types: U32/S32
+ * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
+ */
+__kernel void arg_min_max_y(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(output))
+{
+    const int x_offs            = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y;
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE_OUTPUT) + get_global_id(1) * output_stride_y;
+
+    VEC_TYPE_IN res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_TYPE_IN);
+
+    VEC_TYPE_OUT indx0 = 0;
+    for(DATA_TYPE_OUTPUT y = 1; y < HEIGHT; ++y)
+    {
+        VEC_TYPE_IN in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + y * input_stride_y)), VEC_TYPE_IN);
+
+        VEC_TYPE_OUT cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_TYPE_OUT);
+        indx0                  = select(indx0, (VEC_TYPE_OUT)y, cond_conv);
+        res                    = select(res, in, CONDITION_TO_USE(in, res));
+    }
+
+    // Store result
+    STORE_VECTOR_SELECT(indx, DATA_TYPE_OUTPUT, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif // defined(HEIGHT)
+
+#if defined(DEPTH) && !defined(BATCH)
+/** This kernel performs reduction on z-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
+ * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in] output_ptr                           The local buffer to hold sumed values. Supported data types: U32/S32
+ * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z                      Stride of the output tensor in Z dimension (in bytes)
+ * @param[in] output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
+ */
+__kernel void arg_min_max_z(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+
+    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE_OUTPUT) + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
+
+    VEC_TYPE_IN res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_TYPE_IN);
+
+    VEC_TYPE_OUT indx0 = 0;
+    for(DATA_TYPE_OUTPUT z = 1; z < DEPTH; ++z)
+    {
+        VEC_TYPE_IN in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + z * input_stride_z)), VEC_TYPE_IN);
+
+        VEC_TYPE_OUT cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_TYPE_OUT);
+        indx0                  = select(indx0, (VEC_TYPE_OUT)z, cond_conv);
+        res                    = select(res, in, CONDITION_TO_USE(in, res));
+    }
+
+    // Store result
+    STORE_VECTOR_SELECT(indx, DATA_TYPE_OUTPUT, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif /* defined(DEPTH)  && !defined(BATCH) */
+
+#if defined(BATCH) && defined(DEPTH)
+/** This kernel performs reduction on w-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
+ * @note The depth size must be passed at compile time using -DBATCH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
+ * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in] output_ptr                           The local buffer to hold sumed values. Supported data types: U32/S32
+ * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z                      Stride of the output tensor in Z dimension (in bytes)
+ * @param[in] output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w                      Stride of the output tensor in W dimension (in bytes)
+ * @param[in] output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
+ */
+__kernel void arg_min_max_w(
+    TENSOR4D_DECLARATION(input),
+    TENSOR4D_DECLARATION(output))
+{
+    const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+
+    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y + (get_global_id(2) % DEPTH) * input_stride_z +
+                                  (get_global_id(2) / DEPTH) * input_stride_w;
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE_OUTPUT) + get_global_id(1) * output_stride_y + (get_global_id(
+                                      2) % DEPTH) * output_stride_z + (get_global_id(2) / DEPTH) * output_stride_w;
+
+    VEC_TYPE_IN res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_TYPE_IN);
+
+    VEC_TYPE_OUT indx0 = 0;
+    for(DATA_TYPE_OUTPUT w = 1; w < BATCH; ++w)
+    {
+        VEC_TYPE_IN in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + w * input_stride_w)), VEC_TYPE_IN);
+
+        VEC_TYPE_OUT cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_TYPE_OUT);
+        indx0                  = select(indx0, (VEC_TYPE_OUT)w, cond_conv);
+        res                    = select(res, in, CONDITION_TO_USE(in, res));
+    }
+
+    // Store result
+    STORE_VECTOR_SELECT(indx, DATA_TYPE_OUTPUT, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif /* defined(BATCH) && defined(DEPTH) */
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE_OUTPUT)
diff --git a/src/core/CL/cl_kernels/common/batchnormalization_layer.cl b/src/core/CL/cl_kernels/common/batchnormalization_layer.cl
new file mode 100644
index 0000000000..18f54907df
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/batchnormalization_layer.cl
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(EPSILON)
+/** OpenCL kernel to fuse the weights of convolution or depthwise convolution layer with batch normalization when the data layout is either NCHW or NHWC
+ *
+ * @note The input weights tensor is assumed 4D with the OFMs in the fourth dimension
+ * @note Data type should be passed at compile time using the -DDATA_TYPE, e.g. -DDATA_TYPE=float
+ * @note The third dimension of the input tensor should be passed at compile time when weights belong to a convolution layer using -DDIM2=size. e.g. -DDIM2=16.
+ *       For depthwise convolution weight do not pass DIM2
+ * @note Data layout NHWC should be passed at compile time with -DNHWC. For data layout NCHW it is not required to pass any parameter
+ * @note Batch normalization epsilon parameter should be passed at compile time using -DEPSILON=value. e.g. -DEPSILON=0.001f
+ *
+ * @param[in]  w_ptr                                 Pointer to the weights tensor. Supported data types: F16/F32
+ * @param[in]  w_stride_x                            Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  w_step_x                              w_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  w_stride_y                            Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  w_step_y                              w_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  w_stride_z                            Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  w_step_z                              w_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  w_offset_first_element_in_bytes       The offset of the first element in the weights tensor
+ * @param[in]  b_ptr                                 (Optional) Pointer to the bias tensor. Supported data types: same as @p w_ptr
+ * @param[in]  b_stride_x                            (Optional) Stride of the bias tensor in X dimension (in bytes)
+ * @param[in]  b_step_x                              (Optional) b_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  b_stride_y                            (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in]  b_step_y                              (Optional) b_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  b_stride_z                            (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in]  b_step_z                              (Optional) b_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  b_offset_first_element_in_bytes       (Optional) The offset of the first element in the bias tensor
+ * @param[in]  mean_ptr                              Pointer to the mean source tensor. Supported data types: same as @p w_ptr
+ * @param[in]  mean_stride_x                         Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in]  mean_step_x                           mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mean_offset_first_element_in_bytes    The offset of the first element in the mean source tensor
+ * @param[in]  var_ptr                               Pointer to the var tensor. Supported data types: same as @p w_ptr
+ * @param[in]  var_stride_x                          Stride of the var tensor in X dimension (in bytes)
+ * @param[in]  var_step_x                            var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  var_offset_first_element_in_bytes     The offset of the first element in the var source tensor
+ * @param[out] w_fused_ptr                           (Optional) Pointer to the destination weights tensors. Supported data types: same as @p w_ptr
+ * @param[in]  w_fused_stride_x                      (Optional) Stride of the destination weights tensor in X dimension (in bytes)
+ * @param[in]  w_fused_step_x                        (Optional) w_fused_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  w_fused_stride_y                      (Optional) Stride of the destination weights tensor in Y dimension (in bytes)
+ * @param[in]  w_fused_step_y                        (Optional) w_fused_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  w_fused_stride_z                      (Optional) Stride of the destination weights tensor in Z dimension (in bytes)
+ * @param[in]  w_fused_step_z                        (Optional) w_fused_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  w_fused_offset_first_element_in_bytes (Optional) The offset of the first element in the destination weights tensor
+ * @param[in]  b_fused_ptr                           (Optional) Pointer to the destination bias tensor. Supported data types: same as @p w_ptr
+ * @param[in]  b_fused_stride_x                      (Optional) Stride of the destination bias tensor in X dimension (in bytes)
+ * @param[in]  b_fused_step_x                        (Optional) b_fused_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  b_fused_offset_first_element_in_bytes (Optional) The offset of the first element in the destination bias tensor
+ * @param[in]  beta_ptr                              (Optional) Pointer to the beta source tensor. Supported data types: same as @p w_ptr
+ * @param[in]  beta_stride_x                         (Optional) Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in]  beta_step_x                           (Optional) beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  beta_offset_first_element_in_bytes    (Optional) The offset of the first element in the beta source tensor
+ * @param[in]  gamma_ptr                             (Optional) Pointer to the gamma source tensor. Supported data types: same as @p w_ptr
+ * @param[in]  gamma_stride_x                        (Optional) Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in]  gamma_step_x                          (Optional) gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  gamma_offset_first_element_in_bytes   (Optional) The offset of the first element in the gamma source tensor
+ */
+__kernel void fuse_batchnormalization_layer(TENSOR3D_DECLARATION(w),
+#if defined(BIAS)
+                                            VECTOR_DECLARATION(b),
+#endif // defined(BIAS)
+                                            VECTOR_DECLARATION(mean),
+                                            VECTOR_DECLARATION(var)
+#ifndef IN_PLACE_W
+                                            ,
+                                            TENSOR3D_DECLARATION(w_fused)
+#endif // ifndef IN_PLACE_W
+#ifndef IN_PLACE_B
+                                            ,
+                                            VECTOR_DECLARATION(b_fused)
+#endif // ifndef IN_PLACE_B
+#if defined(BETA)
+                                            ,
+                                            VECTOR_DECLARATION(beta)
+#endif // defined(BETA)
+#if defined(GAMMA)
+                                            ,
+                                            VECTOR_DECLARATION(gamma)
+#endif // defined(GAMMA)
+                                           )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    int z = get_global_id(2);
+
+#if defined(DIM2)
+    int c0 = z % DIM2;
+    int c1 = z / DIM2;
+#else // ! defined(DIM2)
+    int c0                                                                                    = 0;
+#if defined(NHWC)
+    int c1                                                                                    = x;
+#else  // defined(NHWC)
+    int c1 = z;
+#endif // defined(NHWC)
+#endif // defined(DIM2)
+
+    int w_offset = x * sizeof(DATA_TYPE) + y * w_stride_y + z * w_stride_z;
+    int v_offset = c1 * sizeof(DATA_TYPE);
+
+    DATA_TYPE w_old = 0.0f;
+    DATA_TYPE b_old = 0.0f;
+    DATA_TYPE w_new = 0.0f;
+    DATA_TYPE b_new = 0.0f;
+    DATA_TYPE gamma = 1.0f;
+    DATA_TYPE mean  = 0.0f;
+    DATA_TYPE var   = 1.0f;
+    DATA_TYPE beta  = 0.0f;
+
+    w_old = *((__global DATA_TYPE *)(w_ptr + w_offset + w_offset_first_element_in_bytes));
+    var   = *((__global DATA_TYPE *)(var_ptr + v_offset + var_offset_first_element_in_bytes));
+    mean  = *((__global DATA_TYPE *)(mean_ptr + v_offset + mean_offset_first_element_in_bytes));
+
+#if defined(GAMMA)
+    gamma = *((__global DATA_TYPE *)(gamma_ptr + v_offset + gamma_offset_first_element_in_bytes));
+#endif // defined(GAMMA)
+
+    // Compute new weight
+    w_new = (gamma * w_old) / (sqrt(var + EPSILON));
+
+#if defined(IN_PLACE_W)
+    *((__global DATA_TYPE *)(w_ptr + w_offset + w_offset_first_element_in_bytes)) = w_new;
+#else  // defined(IN_PLACE_W)
+    *((__global DATA_TYPE *)(w_fused_ptr + w_offset + w_fused_offset_first_element_in_bytes)) = w_new;
+#endif // defined(IN_PLACE_W)
+
+    // Compute bias
+#if !defined(DIM2) && defined(NHWC)
+    if(z == 0 && y == 0)
+#else  // !defined(DIM2) && defined(NHWC)
+    if(x == 0 && y == 0 && c0 == 0)
+#endif // !defined(DIM2) && defined(NHWC)
+    {
+#if defined(BIAS)
+        b_old = *((__global DATA_TYPE *)(b_ptr + v_offset + b_offset_first_element_in_bytes));
+#endif // defined(BIAS)
+#if defined(BETA)
+        beta = *((__global DATA_TYPE *)(beta_ptr + v_offset + beta_offset_first_element_in_bytes));
+#endif // defined(BETA)
+
+        b_new = ((gamma * (b_old - mean)) / (sqrt(var + EPSILON))) + beta;
+
+#if defined(BIAS)
+
+#if defined(IN_PLACE_B)
+        *((__global DATA_TYPE *)(b_ptr + v_offset + b_offset_first_element_in_bytes)) = b_new;
+#else  // defined(IN_PLACE_B)
+        *((__global DATA_TYPE *)(b_fused_ptr + v_offset + b_fused_offset_first_element_in_bytes)) = b_new;
+#endif // defined(IN_PLACE_B)
+
+#else // defined(BIAS)
+
+#ifndef IN_PLACE_B
+        *((__global DATA_TYPE *)(b_fused_ptr + v_offset + b_fused_offset_first_element_in_bytes)) = b_new;
+#endif // ifndef IN_PLACE_B
+
+#endif // defined(BIAS)
+    }
+}
+#endif // defined(DATA_TYPE) && defined(EPSILON)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/bitwise_op.cl b/src/core/CL/cl_kernels/common/bitwise_op.cl
new file mode 100644
index 0000000000..e142c1d275
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/bitwise_op.cl
@@ -0,0 +1,218 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
+
+/** This function computes the bitwise OR of two input images.
+ *
+ * @note The following variables must be passed at compile time:
+ * -# -DVEC_SIZE         : The number of elements processed in X dimension
+ * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE
+ *
+ * @param[in]  in1_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  in1_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  in2_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  in2_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void bitwise_or(
+    IMAGE_DECLARATION(in1),
+    IMAGE_DECLARATION(in2),
+    IMAGE_DECLARATION(out))
+{
+    uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+
+    // Get pixels pointer
+    __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x_offs + get_global_id(1) * in1_step_y;
+    __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x_offs + get_global_id(1) * in2_step_y;
+    __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x_offs + get_global_id(1) * out_step_y;
+
+    // Load data
+    VEC_DATA_TYPE(uchar, VEC_SIZE)
+    in_a = VLOAD(VEC_SIZE)(0, (__global uchar *)in1_addr);
+    VEC_DATA_TYPE(uchar, VEC_SIZE)
+    in_b = VLOAD(VEC_SIZE)(0, (__global uchar *)in2_addr);
+
+    VEC_DATA_TYPE(uchar, VEC_SIZE)
+    data0 = in_a | in_b;
+
+    // Boundary-aware store
+    STORE_VECTOR_SELECT(data, uchar, (__global uchar *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+
+/** This function computes the bitwise AND of two input images.
+ *
+ * @note The following variables must be passed at compile time:
+ * -# -DVEC_SIZE         : The number of elements processed in X dimension
+ * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE
+ *
+ * @param[in]  in1_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  in1_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  in2_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  in2_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void bitwise_and(
+    IMAGE_DECLARATION(in1),
+    IMAGE_DECLARATION(in2),
+    IMAGE_DECLARATION(out))
+{
+    uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+
+    // Get pixels pointer
+    __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x_offs + get_global_id(1) * in1_step_y;
+    __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x_offs + get_global_id(1) * in2_step_y;
+    __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x_offs + get_global_id(1) * out_step_y;
+
+    // Load data
+    VEC_DATA_TYPE(uchar, VEC_SIZE)
+    in_a = VLOAD(VEC_SIZE)(0, (__global uchar *)in1_addr);
+    VEC_DATA_TYPE(uchar, VEC_SIZE)
+    in_b = VLOAD(VEC_SIZE)(0, (__global uchar *)in2_addr);
+
+    VEC_DATA_TYPE(uchar, VEC_SIZE)
+    data0 = in_a & in_b;
+
+    // Boundary-aware store
+    STORE_VECTOR_SELECT(data, uchar, (__global uchar *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+
+/** This function computes the bitwise XOR of two input images.
+ *
+ * @note The following variables must be passed at compile time:
+ * -# -DVEC_SIZE         : The number of elements processed in X dimension
+ * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE
+ *
+ * @param[in]  in1_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  in1_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  in2_ptr                           Pointer to the source image. Supported data types: U8
+ * @param[in]  in2_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void bitwise_xor(
+    IMAGE_DECLARATION(in1),
+    IMAGE_DECLARATION(in2),
+    IMAGE_DECLARATION(out))
+{
+    uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+
+    // Get pixels pointer
+    __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x_offs + get_global_id(1) * in1_step_y;
+    __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x_offs + get_global_id(1) * in2_step_y;
+    __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x_offs + get_global_id(1) * out_step_y;
+
+    // Load data
+    VEC_DATA_TYPE(uchar, VEC_SIZE)
+    in_a = VLOAD(VEC_SIZE)(0, (__global uchar *)in1_addr);
+    VEC_DATA_TYPE(uchar, VEC_SIZE)
+    in_b = VLOAD(VEC_SIZE)(0, (__global uchar *)in2_addr);
+
+    VEC_DATA_TYPE(uchar, VEC_SIZE)
+    data0 = in_a ^ in_b;
+
+    // Boundary-aware store
+    STORE_VECTOR_SELECT(data, uchar, (__global uchar *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+
+/** This function computes the bitwise NOT of an images.
+ *
+ * @note The following variables must be passed at compile time:
+ * -# -DVEC_SIZE         : The number of elements processed in X dimension
+ * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE
+ *
+ * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8
+ * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void bitwise_not(
+    IMAGE_DECLARATION(in1),
+    IMAGE_DECLARATION(out))
+{
+    uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+
+    // Get pixels pointer
+    __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x_offs + get_global_id(1) * in1_step_y;
+    __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x_offs + get_global_id(1) * out_step_y;
+
+    // Load data
+    VEC_DATA_TYPE(uchar, VEC_SIZE)
+    in_a = VLOAD(VEC_SIZE)(0, (__global uchar *)in1_addr);
+
+    VEC_DATA_TYPE(uchar, VEC_SIZE)
+    data0 = ~in_a;
+
+    // Boundary-aware store
+    STORE_VECTOR_SELECT(data, uchar, (__global uchar *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+
+#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/bounding_box_transform.cl b/src/core/CL/cl_kernels/common/bounding_box_transform.cl
new file mode 100644
index 0000000000..f2e9cb0ed0
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/bounding_box_transform.cl
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(WEIGHT_X) && defined(WEIGHT_Y) && defined(WEIGHT_W) && defined(WEIGHT_H) && defined(IMG_WIDTH) && defined(IMG_HEIGHT) && defined(BOX_FIELDS) && defined(SCALE_BEFORE) // Check for compile time constants
+
+/** Transform proposal bounding boxes to target bounding box using bounding box deltas.
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE= Tensor data type. Supported data types: F16/F32
+ * -# -DWEIGHT{X,Y,W,H}= Weights [wx, wy, ww, wh] for the deltas
+ * -# -DIMG_WIDTH= Original image width
+ * -# -DIMG_HEIGHT= Original image height
+ * -# -DBOX_FIELDS= Number of fields that are used to represent a box in boxes
+ *
+ * @param[in]  boxes_ptr                                Pointer to the boxes tensor. Supported data types: F16/F32
+ * @param[in]  boxes_stride_x                           Stride of the boxes tensor in X dimension (in bytes)
+ * @param[in]  boxes_step_x                             boxes_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  boxes_stride_y                           Stride of the boxes tensor in Y dimension (in bytes)
+ * @param[in]  boxes_step_y                             boxes_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  boxes_stride_z                           Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  boxes_step_z                             boxes_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  boxes_offset_first_element_in_bytes      The offset of the first element in the boxes tensor
+ * @param[out] pred_boxes_ptr                           Pointer to the predicted boxes. Supported data types: same as @p in_ptr
+ * @param[in]  pred_boxes_stride_x                      Stride of the predicted boxes in X dimension (in bytes)
+ * @param[in]  pred_boxes_step_x                        pred_boxes_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  pred_boxes_stride_y                      Stride of the predicted boxes in Y dimension (in bytes)
+ * @param[in]  pred_boxes_step_y                        pred_boxes_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  pred_boxes_stride_z                      Stride of the predicted boxes in Z dimension (in bytes)
+ * @param[in]  pred_boxes_step_z                        pred_boxes_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  pred_boxes_offset_first_element_in_bytes The offset of the first element in the predicted boxes
+ * @param[in]  deltas_ptr                               Pointer to the deltas tensor. Supported data types: same as @p in_ptr
+ * @param[in]  deltas_stride_x                          Stride of the deltas tensor in X dimension (in bytes)
+ * @param[in]  deltas_step_x                            deltas_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  deltas_stride_y                          Stride of the deltas tensor in Y dimension (in bytes)
+ * @param[in]  deltas_step_y                            deltas_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  deltas_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  deltas_step_z                            deltas_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  deltas_offset_first_element_in_bytes     The offset of the first element in the deltas tensor
+ */
+__kernel void bounding_box_transform(
+    VECTOR_DECLARATION(boxes),
+    IMAGE_DECLARATION(pred_boxes),
+    IMAGE_DECLARATION(deltas))
+{
+    // Get pixels pointer
+    Vector boxes      = CONVERT_TO_VECTOR_STRUCT_NO_STEP(boxes);
+    Image  pred_boxes = CONVERT_TO_IMAGE_STRUCT(pred_boxes);
+    Image  deltas     = CONVERT_TO_IMAGE_STRUCT(deltas);
+
+    // Load delta and box values into registers
+    const DATA_TYPE one     = (DATA_TYPE)1.f;
+    const DATA_TYPE halfone = (DATA_TYPE)0.5f;
+
+    const int py = get_global_id(1); // box
+    const VEC_DATA_TYPE(DATA_TYPE, 4)
+    scale_before = (VEC_DATA_TYPE(DATA_TYPE, 4))SCALE_BEFORE;
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    delta = vload4(0, (__global DATA_TYPE *)deltas.ptr);
+    const VEC_DATA_TYPE(DATA_TYPE, 4)
+    box = vload4(0, (__global DATA_TYPE *)vector_offset(&boxes, BOX_FIELDS * py)) / scale_before;
+
+    // Calculate width and centers of the old boxes
+    const VEC_DATA_TYPE(DATA_TYPE, 2)
+    dims = box.s23 - box.s01 + one;
+    const VEC_DATA_TYPE(DATA_TYPE, 2)
+    ctr = box.s01 + halfone * dims;
+    const VEC_DATA_TYPE(DATA_TYPE, 4)
+    weights = (VEC_DATA_TYPE(DATA_TYPE, 4))(WEIGHT_X, WEIGHT_Y, WEIGHT_W, WEIGHT_H);
+    delta /= weights;
+    delta.s23 = min(delta.s23, (DATA_TYPE)BBOX_XFORM_CLIP);
+
+    // Calculate widths and centers of the new boxes (translation + aspect ratio transformation)
+    const VEC_DATA_TYPE(DATA_TYPE, 2)
+    pred_ctr = delta.s01 * dims + ctr;
+    const VEC_DATA_TYPE(DATA_TYPE, 2)
+    pred_dims = exp(delta.s23) * dims;
+
+    // Useful vector constant definitions
+    const VEC_DATA_TYPE(DATA_TYPE, 4)
+    max_values = (VEC_DATA_TYPE(DATA_TYPE, 4))(IMG_WIDTH - 1, IMG_HEIGHT - 1, IMG_WIDTH - 1, IMG_HEIGHT - 1);
+    const VEC_DATA_TYPE(DATA_TYPE, 4)
+    sign = (VEC_DATA_TYPE(DATA_TYPE, 4))(-1, -1, 1, 1);
+    const VEC_DATA_TYPE(DATA_TYPE, 4)
+    min_values = 0;
+
+    // Calculate the coordinates of the new boxes
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    pred_box = pred_ctr.s0101 + sign * halfone * pred_dims.s0101;
+#ifdef OFFSET // Possibly adjust the predicted boxes
+    pred_box.s23 -= one;
+#endif // Possibly adjust the predicted boxes
+    pred_box = CLAMP(pred_box, min_values, max_values);
+#ifdef SCALE_AFTER // Possibly scale the predicted boxes
+    pred_box *= (VEC_DATA_TYPE(DATA_TYPE, 4))SCALE_AFTER;
+#endif // Possibly scale the predicted boxes
+
+    // Store them into the output
+    vstore4(pred_box, 0, (__global DATA_TYPE *)pred_boxes.ptr);
+}
+
+#endif // defined(DATA_TYPE) && defined(WEIGHT_X) && defined(WEIGHT_Y) && defined(WEIGHT_W) && defined(WEIGHT_H) && defined(IMG_WIDTH) && defined(IMG_HEIGHT) && defined(BOX_FIELDS) && defined(SCALE_BEFORE)
diff --git a/src/core/CL/cl_kernels/common/bounding_box_transform_quantized.cl b/src/core/CL/cl_kernels/common/bounding_box_transform_quantized.cl
new file mode 100644
index 0000000000..c1d45a56b9
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/bounding_box_transform_quantized.cl
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers_asymm.h"
+
+#if defined(DATA_TYPE) && defined(DATA_TYPE_DELTAS) && defined(WEIGHT_X) && defined(WEIGHT_Y) && defined(WEIGHT_W) && defined(WEIGHT_H) && defined(IMG_WIDTH) && defined(IMG_HEIGHT) && defined(BOX_FIELDS) && defined(SCALE_BEFORE) && defined(OFFSET_BOXES) && defined(SCALE_BOXES) && defined(OFFSET_DELTAS) && defined(SCALE_DELTAS) && defined(OFFSET_PRED_BOXES) && defined(SCALE_PRED_BOXES) // Check for compile time constants
+
+/** Transform proposal bounding boxes to target bounding box using bounding box deltas for quantized data types.
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE= Tensor data type. Supported data types: QASYMM16 for boxes and pred_boxes, QASYMM8 for for deltas
+ * -# -DWEIGHT{X,Y,W,H}= Weights [wx, wy, ww, wh] for the deltas
+ * -# -DIMG_WIDTH= Original image width
+ * -# -DIMG_HEIGHT= Original image height
+ * -# -DBOX_FIELDS= Number of fields that are used to represent a box in boxes
+ *
+ * @param[in]  boxes_ptr                                Pointer to the boxes tensor. Supported data types: QASYMM16
+ * @param[in]  boxes_stride_x                           Stride of the boxes tensor in X dimension (in bytes)
+ * @param[in]  boxes_step_x                             boxes_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  boxes_stride_y                           Stride of the boxes tensor in Y dimension (in bytes)
+ * @param[in]  boxes_step_y                             boxes_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  boxes_stride_z                           Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  boxes_step_z                             boxes_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  boxes_offset_first_element_in_bytes      The offset of the first element in the boxes tensor
+ * @param[out] pred_boxes_ptr                           Pointer to the predicted boxes. Supported data types: same as @p in_ptr
+ * @param[in]  pred_boxes_stride_x                      Stride of the predicted boxes in X dimension (in bytes)
+ * @param[in]  pred_boxes_step_x                        pred_boxes_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  pred_boxes_stride_y                      Stride of the predicted boxes in Y dimension (in bytes)
+ * @param[in]  pred_boxes_step_y                        pred_boxes_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  pred_boxes_stride_z                      Stride of the predicted boxes in Z dimension (in bytes)
+ * @param[in]  pred_boxes_step_z                        pred_boxes_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  pred_boxes_offset_first_element_in_bytes The offset of the first element in the predicted boxes
+ * @param[in]  deltas_ptr                               Pointer to the deltas tensor. Supported data types: QASYMM8
+ * @param[in]  deltas_stride_x                          Stride of the deltas tensor in X dimension (in bytes)
+ * @param[in]  deltas_step_x                            deltas_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  deltas_stride_y                          Stride of the deltas tensor in Y dimension (in bytes)
+ * @param[in]  deltas_step_y                            deltas_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  deltas_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  deltas_step_z                            deltas_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  deltas_offset_first_element_in_bytes     The offset of the first element in the deltas tensor
+ */
+__kernel void bounding_box_transform_quantized(
+    VECTOR_DECLARATION(boxes),
+    IMAGE_DECLARATION(pred_boxes),
+    IMAGE_DECLARATION(deltas))
+{
+    // Get pixels pointer
+    Vector boxes      = CONVERT_TO_VECTOR_STRUCT_NO_STEP(boxes);
+    Image  pred_boxes = CONVERT_TO_IMAGE_STRUCT(pred_boxes);
+    Image  deltas     = CONVERT_TO_IMAGE_STRUCT(deltas);
+
+    // Load delta and box values into registers
+    const float one     = 1.f;
+    const float halfone = 0.5f;
+
+    const int py           = get_global_id(1); // box
+    float4    scale_before = (float4)SCALE_BEFORE;
+    float4 delta           = DEQUANTIZE(vload4(0, (__global DATA_TYPE_DELTAS *)deltas.ptr), OFFSET_DELTAS, SCALE_DELTAS, DATA_TYPE_DELTAS, 4);
+    float4 box             = DEQUANTIZE(vload4(0, (__global DATA_TYPE *)vector_offset(&boxes, BOX_FIELDS * py)), OFFSET_BOXES, SCALE_BOXES, DATA_TYPE, 4) / scale_before;
+
+    // Calculate width and centers of the old boxes
+    float2 dims    = box.s23 - box.s01 + one;
+    float2 ctr     = box.s01 + halfone * dims;
+    float4 weights = (float4)(WEIGHT_X, WEIGHT_Y, WEIGHT_W, WEIGHT_H);
+    delta /= weights;
+    delta.s23 = min(delta.s23, (float)BBOX_XFORM_CLIP);
+
+    // Calculate widths and centers of the new boxes (translation + aspect ratio transformation)
+    float2 pred_ctr  = delta.s01 * dims + ctr;
+    float2 pred_dims = exp(delta.s23) * dims;
+
+    // Useful vector constant definitions
+    float4 max_values = (float4)(IMG_WIDTH - 1, IMG_HEIGHT - 1, IMG_WIDTH - 1, IMG_HEIGHT - 1);
+    float4 sign       = (float4)(-1, -1, 1, 1);
+    float4 min_values = 0;
+
+    // Calculate the coordinates of the new boxes
+    float4 pred_box = pred_ctr.s0101 + sign * halfone * pred_dims.s0101;
+#ifdef OFFSET // Possibly adjust the predicted boxes
+    pred_box.s23 -= one;
+#endif // Possibly adjust the predicted boxes
+    pred_box = CLAMP(pred_box, min_values, max_values);
+#ifdef SCALE_AFTER // Possibly scale the predicted boxes
+    pred_box *= (float4)SCALE_AFTER;
+#endif // Possibly scale the predicted boxes
+
+    // Store them into the output
+    vstore4(QUANTIZE(pred_box, OFFSET_PRED_BOXES, SCALE_PRED_BOXES, DATA_TYPE, 4), 0, (__global DATA_TYPE *)pred_boxes.ptr);
+}
+#endif // Check for compile time constants
diff --git a/src/core/CL/cl_kernels/common/cast.cl b/src/core/CL/cl_kernels/common/cast.cl
new file mode 100644
index 0000000000..036a683ec7
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/cast.cl
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifdef SATURATE
+#define CONVERT_DOWN(x, type) CONVERT_SAT(x, type)
+#else /* SATURATE */
+#define CONVERT_DOWN(x, type) CONVERT(x, type)
+#endif /* SATURATE */
+
+#define CONVERT_UP(x, type) CONVERT(x, type)
+
+/** This function performs a down-casting
+ *
+ * @attention For QSYMM8_PER_CHANNEL -> QASYMM8, it is user's responsibility to keep track of the quantization info.
+ *
+ * @note The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/F16/F32
+ * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in_step_z                         in_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void cast_down(
+    TENSOR3D_DECLARATION(in),
+    TENSOR3D_DECLARATION(out))
+{
+    int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+
+    __global uchar *in_addr  = in_ptr + in_offset_first_element_in_bytes + sizeof(DATA_TYPE_IN) * x_offs + get_global_id(1) * in_stride_y + get_global_id(2) * in_stride_z;
+    __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + sizeof(DATA_TYPE_OUT) * x_offs + get_global_id(1) * out_stride_y + get_global_id(2) * out_stride_z;
+
+    // Load data
+    VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+    in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)in_addr);
+
+#if defined(IS_DATA_TYPE_QUANTIZED)
+    in_data ^= (VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE))0x80;
+#endif // defined(IS_DATA_TYPE_QUANTIZED)
+
+#if defined(IS_DATA_TYPE_FLOAT)
+    VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
+    res0 = CONVERT_DOWN(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
+    STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+#else  /* defined(IS_DATA_TYPE_FLOAT) */
+    VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
+    res0 = CONVERT_DOWN(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
+    STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+#endif /* defined(IS_DATA_TYPE_FLOAT) */
+}
+
+/** This function performs a up-casting
+ *
+ * @note The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8/S8/U16/S16/U32/S32/F16/F32
+ * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in_step_z                         in_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8/U16/S16/U32/S32/F16/F32
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void cast_up(
+    TENSOR3D_DECLARATION(in),
+    TENSOR3D_DECLARATION(out))
+{
+    int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+
+    __global uchar *in_addr  = in_ptr + in_offset_first_element_in_bytes + sizeof(DATA_TYPE_IN) * x_offs + get_global_id(1) * in_stride_y + get_global_id(2) * in_stride_z;
+    __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + sizeof(DATA_TYPE_OUT) * x_offs + get_global_id(1) * out_stride_y + get_global_id(2) * out_stride_z;
+
+    // Load data
+    VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+    in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)in_addr);
+
+#if defined(IS_DATA_TYPE_FLOAT)
+    VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
+    res0 = CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
+    STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+#else  /* defined(IS_DATA_TYPE_FLOAT) */
+    VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
+    res0 = CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
+    STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+#endif /* defined(IS_DATA_TYPE_FLOAT) */
+}
diff --git a/src/core/CL/cl_kernels/common/col2im.cl b/src/core/CL/cl_kernels/common/col2im.cl
new file mode 100644
index 0000000000..4dc005fd43
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/col2im.cl
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2017-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT) && defined(NUM_GROUPS)
+
+#if ELEMENT_SIZE == 1
+#define COND_DATA_TYPE char
+#elif ELEMENT_SIZE == 2
+#define COND_DATA_TYPE short
+#elif ELEMENT_SIZE == 4
+#define COND_DATA_TYPE int
+#else // ELEMENT_SIZE
+#error "Element size not support"
+#endif // ELEMENT_SIZE
+
+/** This kernel performs a reshaping of the output of the convolution layer
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of the input tensor must be passed at compile time using -DWIDTH_INPUT: e.g. -DWIDTH_INPUT=320
+ * @note The width of the output tensor must be passed at compile time using -DWIDTH_OUTPUT: e.g. -DWIDTH_OUTPUT=600
+ * @note The element size must be passed at compile time using -DELEMENT_SIZE: e.g. -DELEMENT_SIZE=4
+ * @note The number of groups must be passed at compile time using  -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void col2im(
+    TENSOR3D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst))
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(dst);
+
+    const uint xd = get_global_id(1) % WIDTH_OUTPUT; // x coordinate of the destination tensor
+    const uint yd = get_global_id(1) / WIDTH_OUTPUT; // y coordinate of the destination tensor
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    data = vload8(0, (__global DATA_TYPE *)src.ptr);
+
+    uint  x         = get_global_id(0) * 8;
+    uint8 x_clamped = x + (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
+
+    VEC_DATA_TYPE(COND_DATA_TYPE, 8)
+    cond0 = CONVERT((x_clamped < WIDTH_INPUT), VEC_DATA_TYPE(COND_DATA_TYPE, 8));
+
+    // Clamp x if out-of-bounds
+    x_clamped = select((uint8)x, x_clamped, convert_int8(cond0));
+
+    // If out-of-bound, overwrite with the first element
+    data = select((VEC_DATA_TYPE(DATA_TYPE, 8))data.s0, data, cond0);
+
+#if NUM_GROUPS > 1
+    // Compute output offset (batches on 4th dimension)
+    int idx = yd * dst_stride_y + xd * dst_stride_x + (get_global_id(2) / NUM_GROUPS) * dst.stride_w;
+
+    const uint group = get_global_id(2) % NUM_GROUPS; // group ID
+    x_clamped += group * WIDTH_INPUT;
+#else  /* defined(NUM_GROUPS > 1 ) */
+    // Compute output offset (batches on 3rd dimension)
+    int idx = yd * dst.stride_y + xd * dst.stride_x + get_global_id(2) * dst.stride_w;
+#endif /* NUM_GROUPS > 1 */
+
+    // Store value
+    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s0 * dst.stride_z)) = data.s0;
+    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s1 * dst.stride_z)) = data.s1;
+    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s2 * dst.stride_z)) = data.s2;
+    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s3 * dst.stride_z)) = data.s3;
+    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s4 * dst.stride_z)) = data.s4;
+    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s5 * dst.stride_z)) = data.s5;
+    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s6 * dst.stride_z)) = data.s6;
+    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s7 * dst.stride_z)) = data.s7;
+}
+#endif // defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT) && defined(NUM_GROUPS)
diff --git a/src/core/CL/cl_kernels/common/comparisons.cl b/src/core/CL/cl_kernels/common/comparisons.cl
new file mode 100644
index 0000000000..00bb491f85
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/comparisons.cl
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define EQUAL(x, y) ((x) == (y))
+#define NOTEQUAL(x, y) ((x) != (y))
+#define GREATER(x, y) ((x) > (y))
+#define GREATEREQUAL(x, y) ((x) >= (y))
+#define LESS(x, y) ((x) < (y))
+#define LESSEQUAL(x, y) ((x) <= (y))
+
+#ifdef IS_QUANTIZED
+#  define DEFINE_KERNEL_STR(name) compare_##name##_quantized
+#else // IS_QUANTIZED
+#  define DEFINE_KERNEL_STR(name) compare_##name
+#endif // IS_QUANTIZED
+
+#define DEFINE_KERNEL(name) DEFINE_KERNEL_STR(name)
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OP) && defined(OP_NAME)
+/** This function compares two tensors.
+ *
+ * @attention The inputs' data type need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention The comparison operation should be given as a preprocessor argument using -DOP=operation. e.g. -DOP=LESS
+ *
+ * @param[in]  in1_ptr                           Pointer to the source tensor. Supported data types: All non-quantized data types.
+ * @param[in]  in1_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  in2_ptr                           Pointer to the source tensor. Supported data types: same as @p in1_ptr
+ * @param[in]  in2_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void DEFINE_KERNEL(OP_NAME)(
+    TENSOR3D_DECLARATION(in1),
+    TENSOR3D_DECLARATION(in2),
+    TENSOR3D_DECLARATION(out))
+{
+    int dst_x = max((int)get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE, 0);
+
+#if VEC_SIZE_IN1 == 1
+    int in1_x = 0;
+#else // VEC_SIZE_IN1 == 1
+    int in1_x = dst_x;
+#endif // VEC_SIZE_IN1 == 1
+
+#if VEC_SIZE_IN2 == 1
+    int in2_x = 0;
+#else // VEC_SIZE_IN2 == 1
+    int in2_x = dst_x;
+#endif // VEC_SIZE_IN2 == 1
+
+    int y = get_global_id(1);
+    int z = get_global_id(2);
+
+    in1_ptr += in1_offset_first_element_in_bytes + z * in1_stride_z + y * in1_stride_y + in1_x * sizeof(DATA_TYPE);
+    in2_ptr += in2_offset_first_element_in_bytes + z * in2_stride_z + y * in2_stride_y + in2_x * sizeof(DATA_TYPE);
+    out_ptr += out_offset_first_element_in_bytes + z * out_stride_z + y * out_stride_y + dst_x * sizeof(uchar);
+
+    // Load values
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) in_a = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE *)in1_ptr);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) in_b = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE *)in2_ptr);
+
+    // Calculate and store result
+#ifdef IS_QUANTIZED
+    VEC_DATA_TYPE(int, VEC_SIZE) in_a_i32 = CONVERT(in_a, VEC_DATA_TYPE(int, VEC_SIZE));
+    VEC_DATA_TYPE(int, VEC_SIZE) in_b_i32 = CONVERT(in_b, VEC_DATA_TYPE(int, VEC_SIZE));
+
+    VEC_DATA_TYPE(float, VEC_SIZE) in_a_fp = CONVERT(in_a_i32 - OFFSET_IN1, VEC_DATA_TYPE(float, VEC_SIZE)) * SCALE_IN1;
+    VEC_DATA_TYPE(float, VEC_SIZE) in_b_fp = CONVERT(in_b_i32 - OFFSET_IN2, VEC_DATA_TYPE(float, VEC_SIZE)) * SCALE_IN2;
+#else // IS_QUANTIZED
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) in_a_fp = in_a;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) in_b_fp = in_b;
+#endif // IS_QUANTIZED
+
+#if VEC_SIZE == 1
+    uchar res0 = (uchar)select(0, 255, OP(in_a_fp, in_b_fp));
+#else // VEC_SIZE == 1
+    VEC_DATA_TYPE(uchar, VEC_SIZE) res0 = CONVERT(OP(in_a_fp, in_b_fp), VEC_DATA_TYPE(uchar, VEC_SIZE));
+#endif // VEC_SIZE == 1
+
+    STORE_VECTOR_SELECT(res, uchar, out_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OP) && defined(OP_NAME) */
diff --git a/src/core/CL/cl_kernels/common/concatenate.cl b/src/core/CL/cl_kernels/common/concatenate.cl
new file mode 100644
index 0000000000..dc7210a4c4
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/concatenate.cl
@@ -0,0 +1,425 @@
+/*
+ * Copyright (c) 2017-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_QUANT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+inline VEC_QUANT requantize(VEC_QUANT input, float in_offset, float out_offset, float in_scale, float out_scale)
+{
+    const VEC_FLOAT in_f32  = (CONVERT(input, VEC_FLOAT) - (VEC_FLOAT)((float)in_offset)) * (VEC_FLOAT)((float)in_scale);
+    const VEC_FLOAT out_f32 = in_f32 / ((VEC_FLOAT)(float)out_scale) + ((VEC_FLOAT)((float)out_offset));
+    const VEC_QUANT res_q8  = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT), VEC_QUANT);
+    return res_q8;
+}
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
+#if defined(DATA_TYPE)
+#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+#if defined(ELEMENT_SIZE)
+
+#define SELECT_TYPE SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define SEQ VEC_OFFS(int, VEC_SIZE)
+
+#if defined(CONCATENATE_WIDTH_X2)
+/** This kernel concatenates two input tensors into the output tensor along the first dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in]  src1_ptr                           Pointer to the source tensor. Supported data types: All.
+ * @param[in]  src1_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src1_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src1_stride_w                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src1_step_w                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  src2_ptr                           Pointer to the source tensor. Supported data types: same as @p src1_ptr
+ * @param[in]  src2_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src2_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src2_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src2_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src2_stride_w                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src2_step_w                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src1_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_w                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  DEPTH                              Tensor depth
+ * @param[in]  INPUT1_WIDTH                       First input tensor width
+ */
+__kernel void concatenate_width_x2(
+    TENSOR4D_DECLARATION(src1),
+    TENSOR4D_DECLARATION(src2),
+    TENSOR4D_DECLARATION(dst),
+    const int DEPTH,
+    const int INPUT1_WIDTH)
+{
+    // Calculate input indices
+    const int x  = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+    const int y  = get_global_id(1);
+    const int z  = get_global_id(2) % (int)DEPTH;
+    const int w  = get_global_id(2) / (int)DEPTH;
+    const int x1 = min(x, (int)INPUT1_WIDTH - (int)VEC_SIZE);
+    const int x2 = max(x - (int)INPUT1_WIDTH, 0);
+
+    // Calculate inputs and output addresses
+    const __global uchar *dst_addr  = dst_ptr + (int)dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * (int)dst_stride_y + z * (int)dst_stride_z + w * (int)dst_stride_w;
+    const __global uchar *src1_addr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * sizeof(DATA_TYPE) + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w;
+    const __global uchar *src2_addr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * sizeof(DATA_TYPE) + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w;
+
+    VEC_TYPE src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src1_addr);
+    VEC_TYPE src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src2_addr);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT)
+    src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
+    src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT);
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1)  && defined(SCALE_IN2) && defined(SCALE_OUT) */
+    const VEC_INT x_coords = SEQ + (VEC_INT)(x);
+
+    // Rotate src1/2_values, if values0 is a combination of src1_values and src2_values.
+    SELECT_TYPE cond = CONVERT(((VEC_INT)x < (VEC_INT)INPUT1_WIDTH) && ((VEC_INT)x > (VEC_INT)(INPUT1_WIDTH - VEC_SIZE)), SELECT_TYPE);
+    src1_values      = select(src1_values, ROTATE(src1_values, VEC_SIZE, INPUT1_ROTATE_N), cond);
+    src2_values      = select(src2_values, ROTATE(src2_values, VEC_SIZE, INPUT1_ROTATE_N), cond);
+
+    cond                   = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH), SELECT_TYPE);
+    const VEC_TYPE values0 = select(src2_values, src1_values, cond);
+
+    STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif // defined(CONCATENATE_WIDTH_X2)
+
+#if defined(CONCATENATE_WIDTH_X4)
+/** This kernel concatenates four input tensors into the output tensor along the first dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
+ *
+ * @param[in]  src1_ptr                           Pointer to the source tensor. Supported data types: All
+ * @param[in]  src1_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src1_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src1_stride_w                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src1_step_w                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  src2_ptr                           Pointer to the source tensor. Supported data types: same as @p src1_ptr
+ * @param[in]  src2_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src2_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src2_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src2_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src2_stride_w                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src2_step_w                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  src3_ptr                           Pointer to the source tensor. Supported data types: same as @p src1_ptr
+ * @param[in]  src3_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src3_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src3_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src3_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src3_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src3_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src3_stride_w                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src3_step_w                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src3_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  src4_ptr                           Pointer to the source tensor. Supported data types: same as @p src1_ptr
+ * @param[in]  src4_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src4_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src4_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src4_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src4_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src4_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src4_stride_w                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src4_step_w                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src4_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src1_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_w                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  DEPTH                              Tensor depth
+ * @param[in]  INPUT1_WIDTH                       First input tensor width
+ * @param[in]  INPUT2_WIDTH                       Second input tensor width
+ * @param[in]  INPUT3_WIDTH                       Third input tensor width
+ */
+__kernel void concatenate_width_x4(
+    TENSOR4D_DECLARATION(src1),
+    TENSOR4D_DECLARATION(src2),
+    TENSOR4D_DECLARATION(src3),
+    TENSOR4D_DECLARATION(src4),
+    TENSOR4D_DECLARATION(dst),
+    const int DEPTH,
+    const int INPUT1_WIDTH,
+    const int INPUT2_WIDTH,
+    const int INPUT3_WIDTH)
+{
+    // Calculate input indices
+    const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2) % (int)DEPTH;
+    const int w = get_global_id(2) / (int)DEPTH;
+
+    const int x1 = min(x, (int)INPUT1_WIDTH - (int)VEC_SIZE);
+    const int x2 = min(max(x - (int)INPUT1_WIDTH, 0), (int)INPUT2_WIDTH - (int)VEC_SIZE);
+    const int x3 = min(max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH, 0), (int)INPUT3_WIDTH - (int)VEC_SIZE);
+    const int x4 = max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH - (int)INPUT3_WIDTH, 0);
+
+    // Calculate inputs and output addresses
+    const __global uchar *dst_addr  = dst_ptr + (int)dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * (int)dst_stride_y + z * (int)dst_stride_z + w * (int)dst_stride_w;
+    const __global uchar *src1_addr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * sizeof(DATA_TYPE) + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w;
+    const __global uchar *src2_addr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * sizeof(DATA_TYPE) + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w;
+    const __global uchar *src3_addr = src3_ptr + (int)src3_offset_first_element_in_bytes + x3 * sizeof(DATA_TYPE) + y * (int)src3_stride_y + z * (int)src3_stride_z + w * (int)src3_stride_w;
+    const __global uchar *src4_addr = src4_ptr + (int)src4_offset_first_element_in_bytes + x4 * sizeof(DATA_TYPE) + y * (int)src4_stride_y + z * (int)src4_stride_z + w * (int)src4_stride_w;
+
+    VEC_TYPE src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src1_addr);
+    VEC_TYPE src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src2_addr);
+    VEC_TYPE src3_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src3_addr);
+    VEC_TYPE src4_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src4_addr);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4)
+    src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
+    src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT);
+    src3_values = requantize(src3_values, OFFSET_IN3, OFFSET_OUT, SCALE_IN3, SCALE_OUT);
+    src4_values = requantize(src4_values, OFFSET_IN4, OFFSET_OUT, SCALE_IN4, SCALE_OUT);
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4) */
+
+    const VEC_INT x_coords = SEQ + (VEC_INT)(x);
+
+    SELECT_TYPE cond_in2 = CONVERT(((VEC_INT)x < (VEC_INT)INPUT1_WIDTH && (VEC_INT)x > (VEC_INT)(INPUT1_WIDTH - VEC_SIZE)), SELECT_TYPE);
+    SELECT_TYPE cond_in3 = CONVERT(((VEC_INT)x < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH) && (VEC_INT)x > (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH - VEC_SIZE)), SELECT_TYPE);
+    SELECT_TYPE cond_in4 = CONVERT(((VEC_INT)x < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH) && (VEC_INT)x > (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH - VEC_SIZE)), SELECT_TYPE);
+
+    // Rotate src1/2_values, if values0 is a combination of src1_values and src2_values.
+    src1_values = select(src1_values, ROTATE(src1_values, VEC_SIZE, INPUT1_ROTATE_N), cond_in2);
+    src2_values = select(src2_values, ROTATE(src2_values, VEC_SIZE, INPUT1_ROTATE_N), cond_in2);
+    // Rotate src2/3_values, if values0 is a combination of src2_values and src3_values.
+    src2_values = select(src2_values, ROTATE(src2_values, VEC_SIZE, INPUT2_ROTATE_N), cond_in3);
+    src3_values = select(src3_values, ROTATE(src3_values, VEC_SIZE, INPUT2_ROTATE_N), cond_in3);
+    // Rotate src3/4_values, if values0 is a combination of src3_values and src4_values.
+    src3_values = select(src3_values, ROTATE(src3_values, VEC_SIZE, INPUT3_ROTATE_N), cond_in4);
+    src4_values = select(src4_values, ROTATE(src4_values, VEC_SIZE, INPUT3_ROTATE_N), cond_in4);
+
+    cond_in2 = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH), SELECT_TYPE);
+    cond_in3 = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH), SELECT_TYPE);
+    cond_in4 = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH), SELECT_TYPE);
+
+    VEC_TYPE values0 = select(src2_values, src1_values, cond_in2);
+    values0          = select(src3_values, values0, cond_in3);
+    values0          = select(src4_values, values0, cond_in4);
+
+    STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif /* defined(CONCATENATE_WIDTH_X4) */
+#endif /* defined(ELEMENT_SIZE) */
+
+#if defined(WIDTH_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
+#if defined(CONCATENATE_WIDTH)
+/** This kernel concatenates the input tensor into the output tensor along the first dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note The offset for the first spatial dimension has to be passed at compile time using -DWIDTH_OFFSET. i.e. -DWIDTH_OFFSET=128
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_w                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  DEPTH                             Tensor depth
+ */
+__kernel void concatenate_width(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+    const int DEPTH)
+{
+    // Calculate input indices
+    const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2) % (int)DEPTH;
+    const int w = get_global_id(2) / (int)DEPTH;
+
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + z * src_stride_z + w * src_stride_w;
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + w * dst_stride_w;
+
+    VEC_TYPE source_values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+    const VEC_QUANT out0 = requantize(source_values0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
+    STORE_VECTOR_SELECT(out, DATA_TYPE, dst_addr + WIDTH_OFFSET * sizeof(DATA_TYPE), VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+#else  /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+    STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + WIDTH_OFFSET * sizeof(DATA_TYPE), VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+}
+#endif /* defined(CONCATENATE_WIDTH) */
+#endif /* defined(WIDTH_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)*/
+
+#if defined(VEC_SIZE_LEFTOVER)
+#if defined(CONCATENATE_HEIGHT)
+#if defined(HEIGHT_OFFSET) && defined(VEC_SIZE)
+/** This kernel concatenates the input tensor into the output tensor along the second dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
+ * @note Vector sizes supported are 2,4,8 and 16.
+ * @note The offset for the second spatial dimension has to be passed at compile time using -DHEIGHT_OFFSET. i.e. -DHEIGHT_OFFSET=128
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_w                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  DEPTH                             Tensor depth
+ */
+__kernel void concatenate_height(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+    const int DEPTH)
+{
+    const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
+
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + (get_global_id(2) % DEPTH) * src_stride_z + (get_global_id(
+                                   2) / DEPTH) * src_stride_w;
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + (get_global_id(2) % DEPTH) * dst_stride_z + (get_global_id(
+                                   2) / DEPTH) * dst_stride_w;
+
+    VEC_TYPE source_values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+    const VEC_QUANT out0 = requantize(source_values0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
+    STORE_VECTOR_SELECT(out, DATA_TYPE, dst_addr + HEIGHT_OFFSET * dst_stride_y, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+#else  /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+    STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + HEIGHT_OFFSET * dst_stride_y, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+}
+#endif /* defined(CONCATENATE_HEIGHT) */
+#endif /* defined(HEIGHT_OFFSET) */
+
+#if defined(CONCATENATE)
+/** This kernel concatenates the input tensor into the output tensor along the third dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: All
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  offsets                           The offsets to the first valid element of the output tensor in bytes
+ */
+__kernel void concatenate(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    int offset)
+{
+    uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
+
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+
+    VEC_TYPE source_values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+    source_values0 = requantize(source_values0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
+    STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + offset, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif // defined(CONCATENATE)
+#endif /* defined(VEC_SIZE_LEFTOVER) */
+#endif /* defined(DATA_TYPE) */
+#endif /* defined(VEC_SIZE) */
diff --git a/src/core/CL/cl_kernels/common/convert_fc_weights.cl b/src/core/CL/cl_kernels/common/convert_fc_weights.cl
new file mode 100644
index 0000000000..01ef04a7d6
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/convert_fc_weights.cl
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(FACTOR_1) && defined(FACTOR_2)
+/** Perform a NCHW -> NHWC or NHWC -> NCHW conversion for Fully Connected 2D weights.
+ *
+ * For NCHW -> NHWC, FACTOR_1 will be equal to the product of the first two dimensions of FullyConnectedLayer's input and FACTOR_2 will represent the number of channels of that tensor.
+ * For NHWC -> NCHW, FACTOR_1 and FACTOR_2 will hold the same values, but swapped.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Original input tensor width*height and depth should be given as a preprocessor argument using -DFACTOR_1=size and -DFACTOR_2=size for NCHW and vice versa for NHWC. e.g. -DFACTOR_1=256 and -DFACTOR_2=128
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: All.
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void convert_fc_weights(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(0) * dst_stride_x + (get_global_id(1) % FACTOR_1 * FACTOR_2 + get_global_id(1) / FACTOR_1) * dst_stride_y;
+
+    *((__global DATA_TYPE *)dst_addr) = *((__global DATA_TYPE *)src.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(FACTOR_1) && defined(FACTOR_2)
diff --git a/src/core/CL/cl_kernels/common/convolution_layer.cl b/src/core/CL/cl_kernels/common/convolution_layer.cl
new file mode 100644
index 0000000000..be76929ac8
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/convolution_layer.cl
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(NUM_GROUPS)
+/** This kernel reshapes the tensor's low three dimensions to single column
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note The number of groups should be given as a preprocessor argument using -DNUM_GROUPS=number. e.g. -DNUM_GROUPS=2
+ *
+ * @param[in]  src_ptr                            Pointer to the source tensor. Supported data types: All
+ * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  bias_ptr                           Pointer to the bias tensor. Supported data types: F16/F32, for quantized types this must be nullptr
+ * @param[in]  bias_stride_x                      Stride of the bias tensor in X dimension (in bytes)
+ * @param[in]  bias_step_x                        bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  width                              The width of the input tensor
+ * @param[in]  height                             The height of the input tensor
+ * @param[in]  depth                              The depth of the input tensor
+ * @param[in]  total_filters                      Total number of filters. 4th dimension of the weights matrix
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ */
+__kernel void reshape_to_columns(
+    TENSOR3D_DECLARATION(src),
+    IMAGE_DECLARATION(dst),
+#ifdef HAS_BIAS
+    VECTOR_DECLARATION(bias),
+#endif /* HAS_BIAS */
+    uint width, uint height, uint depth, uint total_filters, uint dst_stride_z)
+{
+    Tensor3D src            = CONVERT_TO_TENSOR3D_STRUCT(src);
+    bool     is_last_thread = (get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1));
+
+    __global uchar *tmp_src_ptr = src.ptr;
+    __global uchar *tmp_dst_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(0) * dst_stride_y + get_global_id(1) * width * dst_stride_y + get_global_id(
+                                      2) * width * height * dst_stride_y;
+#ifdef HAS_BIAS
+    __global uchar *tmp_bias_ptr = bias_ptr + bias_offset_first_element_in_bytes;
+#endif /* HAS_BIAS */
+
+    if(is_last_thread)
+    {
+        for(uint g = 0; g < NUM_GROUPS; ++g)
+        {
+            __global uchar *curr_group_dst = tmp_dst_ptr;
+
+            for(uint i = 0; i < total_filters / NUM_GROUPS; ++i)
+            {
+                *((__global DATA_TYPE *)curr_group_dst) = *((__global DATA_TYPE *)tmp_src_ptr);
+
+#ifdef HAS_BIAS
+                *((__global DATA_TYPE *)(curr_group_dst + dst_stride_y)) = *((__global DATA_TYPE *)(tmp_bias_ptr));
+                tmp_bias_ptr += bias_stride_x;
+#endif /* HAS_BIAS */
+                tmp_src_ptr += depth * src_stride_z;
+                curr_group_dst += dst_stride_x;
+            }
+
+            tmp_dst_ptr += dst_stride_z;
+        }
+    }
+    else
+    {
+        for(uint g = 0; g < NUM_GROUPS; ++g)
+        {
+            __global uchar *curr_group_dst = tmp_dst_ptr;
+
+            for(uint i = 0; i < total_filters / NUM_GROUPS; ++i)
+            {
+                *((__global DATA_TYPE *)curr_group_dst) = *((__global DATA_TYPE *)tmp_src_ptr);
+                tmp_src_ptr += depth * src_stride_z;
+                curr_group_dst += dst_stride_x;
+            }
+
+            tmp_dst_ptr += dst_stride_z;
+        }
+    }
+}
+#endif // defined(DATA_TYPE)
diff --git a/src/core/CL/cl_kernels/common/copy_tensor.cl b/src/core/CL/cl_kernels/common/copy_tensor.cl
new file mode 100644
index 0000000000..753b98d1b0
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/copy_tensor.cl
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
+/** Performs a copy of input tensor to the output tensor.
+ *
+ * @note The following variables must be passed at compile time:
+ * -# -DDATA_TYPE        : Input and output datatypes.
+ * -# -DVEC_SIZE         : The number of elements processed in X dimension
+ * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE
+ *
+ * @param[in]  in_ptr                            Pointer to the source tensor. Supported data types: All
+ * @param[in]  in_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: same as @p in_ptr
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void copy_tensor(
+    TENSOR3D_DECLARATION(in),
+    TENSOR3D_DECLARATION(out))
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(in);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    // Boundary-aware access:
+    // If the there's left-over in width (VEC_SIZE_LEFTOVER > 0):
+    // Shift all accesses other than the first to avoid accessing out of bounds
+    const int shift = max((int)(get_global_id(0) * VEC_SIZE) - (int)VEC_SIZE_LEFTOVER, 0) % VEC_SIZE;
+    in.ptr -= shift * in.stride_x;
+    out.ptr -= shift * out.stride_x;
+
+    // Load data
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+
+    // Boundary-aware store
+    STORE_VECTOR_SELECT(data, DATA_TYPE, (__global DATA_TYPE *)out.ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/crop_tensor.cl b/src/core/CL/cl_kernels/common/crop_tensor.cl
new file mode 100644
index 0000000000..d9090dc838
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/crop_tensor.cl
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) // Compile time constants
+
+/** Performs a tensor cropping.
+ *
+ * @param[in]  in_ptr                            Pointer to the source tensor. Supported data types: All
+ * @param[in]  in_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: F32
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  in_offset_y                       The initial offset of the input address along Y.
+ * @param[in]  in_offset_z                       The initial offset of the input address along Z.
+ */
+__kernel void crop_tensor(
+    TENSOR3D_DECLARATION(in),
+    TENSOR3D_DECLARATION(out),
+    int in_offset_y,
+    int in_offset_z)
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(in);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    const int in_x = get_global_id(0) * (in_step_x / in_stride_x);
+
+#if defined(WIDTH_FLIPPED)
+    const int in_y = in_offset_y - get_global_id(1);
+#else  // defined(WIDTH_FLIPPED)
+    const int in_y                 = in_offset_y + get_global_id(1);
+#endif // defined(WIDTH_FLIPPED)
+
+#if defined(HEIGHT_FLIPPED)
+    const int in_z = in_offset_z - get_global_id(2);
+#else  // defined(HEIGHT_FLIPPED)
+    const int in_z                 = in_offset_z + get_global_id(2);
+#endif // defined(HEIGHT_FLIPPED)
+
+#if defined(VEC_SIZE)
+
+#if defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does then shift access vector to access elements within bounds
+    const int shift = max((int)(get_global_id(0) * VEC_SIZE) - (int)LAST_ACCESSED_X, 0);
+    in.ptr -= shift * in.stride_x;
+    out.ptr -= shift * out.stride_x;
+#endif // defined(LAST_ACCESSED_X)
+
+    __global const uchar *input_addr = tensor3D_offset(&in, in_x, in_y, in_z);
+
+    // Load data
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
+
+    // Store result
+    VSTORE(VEC_SIZE)
+    (CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE)), 0, (__global float *)out.ptr);
+#else  // defined(VEC_SIZE)
+    *((__global float *)(out.ptr)) = CONVERT(*((__global DATA_TYPE *)tensor3D_offset(&in, in_x, in_y, in_z)), float);
+#endif // defined(VEC_SIZE)
+}
+
+#endif // defined(DATA_TYPE) && defined(LAST_ACCESSED_X)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/deconvolution_layer.cl b/src/core/CL/cl_kernels/common/deconvolution_layer.cl
new file mode 100644
index 0000000000..4ac5e3f0e9
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/deconvolution_layer.cl
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function applies upsample on an input image.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: All.
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void deconvolution_upsample(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    // Store result
+    *((__global DATA_TYPE *)dst.ptr) = *((__global DATA_TYPE *)src.ptr);
+}
+
+#if defined(FILTER_WIDTH) && defined(FILTER_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE)
+/** This kernel reshapes the deconvolution output tensor before returning the result of the Deconvolution. The decovnolution output tensor
+ * is the result of a @ref CLGEMM operation between the deconvolution input and the deconvolution filter
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type, e.g., -DDATA_TYPE=F32
+ * @note The width of the filter should be given as a preprocessor argument using -DFILTER_WIDTH=width, e.g., -DFILTER_WIDTH=2
+ * @note The height of the filter should be given as a preprocessor argument using -DFILTER_HEIGHT=height, e.g., -DFILTER_HEIGHT=2
+ * @note The width of the input should be given as a preprocessor argument using -DSRC_WIDTH=width, e.g., -DSRC_WIDTH=10
+ * @note The height of the input should be given as a preprocessor argument using -DSRC_HEIGHT=width, e.g., -DSRC_HEIGHT=10
+ * @note The output data layout is NHWC if the preprocessor argument NUM_FILTERS is defined, NCHW if NUM_FILTERS is not defined
+ *
+ * @param[in]  src_ptr                            Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/F16/F32
+ * @param[in]  src_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] dst_ptr                            Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination image
+ * @param[in]  bias_ptr                           (Optional) Pointer to the biases vector. Supported data types: F16/F32/S32
+ * @param[in]  bias_stride_x                      (Optional) Stride of the biases vector in X dimension (in bytes)
+ * @param[in]  bias_step_x                        (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
+ */
+__kernel void deconvolution_reshape(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst)
+#if defined(ADD_BIAS)
+    ,
+    VECTOR_DECLARATION(bias)
+#endif // defined(ADD_BIAS)
+)
+{
+#define FILTER_AREA ((FILTER_WIDTH) * (FILTER_HEIGHT))
+
+    Tensor3D        src  = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D        dst  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(dst);
+    const DATA_TYPE data = *(__global DATA_TYPE *)src.ptr;
+
+    // Store result
+    const int x_in = get_global_id(0);
+    const int y_in = get_global_id(1);
+    const int z_in = get_global_id(2);
+
+#if defined(NUM_FILTERS)
+    const int bias_index = x_in / (FILTER_AREA);
+    const int z_out      = bias_index + (NUM_FILTERS) * (z_in / (SRC_HEIGHT));
+    const int x_out      = x_in % (FILTER_WIDTH) + y_in * (FILTER_WIDTH);
+    const int y_out      = (FILTER_HEIGHT) * (z_in % (SRC_HEIGHT)) + ((x_in % (FILTER_AREA)) / (FILTER_WIDTH));
+#else  // defined(NUM_FILTERS)
+    const int x_out      = x_in / (FILTER_AREA);
+    const int y_out      = x_in % (FILTER_WIDTH) + y_in * (FILTER_WIDTH);
+    const int z_out      = (FILTER_HEIGHT) * z_in + ((x_in % (FILTER_AREA)) / (FILTER_WIDTH));
+    const int bias_index = x_out;
+#endif // defined(NUM_FILTERS)
+
+#if defined(ADD_BIAS)
+    Vector          bias     = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
+    const DATA_TYPE bias_val = *(__global DATA_TYPE *)vector_offset(&bias, bias_index);
+    *((__global DATA_TYPE *)tensor3D_offset(&dst, x_out, y_out, z_out)) = data + bias_val;
+#else  // defined(ADD_BIAS)
+    *((__global DATA_TYPE *)tensor3D_offset(&dst, x_out, y_out, z_out)) = data;
+#endif // defined(ADD_BIAS)
+
+#undef FILTER_AREA
+}
+#endif // defined(FILTER_WIDTH) && defined(FILTER_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE)
diff --git a/src/core/CL/cl_kernels/common/dequantization_layer.cl b/src/core/CL/cl_kernels/common/dequantization_layer.cl
new file mode 100644
index 0000000000..7fa62577ce
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/dequantization_layer.cl
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET)
+
+/** This performs the dequantization of 8-bit unsigned integers to floating point.
+ *
+ * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
+ * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Quantization scale of input tensor is passed in with -DSCALE=scale.
+ * @note Quantization offset of input tensor is passed in with -DOFFSET=offset.
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: F16/F32
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void dequantization_layer(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+    output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+
+    // Load data
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
+
+    // Create scale and offset vectors
+    const VEC_DATA_TYPE(float, VEC_SIZE)
+    vscale = SCALE;
+
+    const VEC_DATA_TYPE(int, VEC_SIZE)
+    voffset = OFFSET;
+
+    // Dequantize
+    VEC_DATA_TYPE(float, VEC_SIZE)
+    res = vscale * CONVERT((val - voffset), VEC_DATA_TYPE(float, VEC_SIZE));
+
+    // Store result
+    VSTORE(VEC_SIZE)
+    (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
+#else  // !defined(LAST_ACCESSED_X)
+    *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr))) - (int)(OFFSET)) * (float)(SCALE));
+#endif // defined(LAST_ACCESSED_X)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/elementwise_operation.cl b/src/core/CL/cl_kernels/common/elementwise_operation.cl
new file mode 100644
index 0000000000..91e51d9d1a
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/elementwise_operation.cl
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2018-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(DATA_TYPE)
+
+/** List of all the operations supported by this kernel.
+ * @note ADD and SUB operations, when executed on integers, support saturation */
+#ifdef SATURATE
+#define ADD(x, y) add_sat((x), (y))
+#define SUB(x, y) sub_sat((x), (y))
+#else /* SATURATE */
+#define ADD(x, y) (x) + (y)
+#define SUB(x, y) (x) - (y)
+#endif /* SATURATE */
+
+#define MAX(x, y) max(x, y)
+#define MIN(x, y) min(x, y)
+#define SQUARED_DIFF(x, y) (x - y) * (x - y)
+#define POWER(x, y) pow(x, y)
+
+#if VEC_SIZE_OUT == 1
+#define PRELU(x, y) (x > 0 ? x : x * y)
+#else // VEC_SIZE_OUT == 1
+#define PRELU(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))))
+#endif // VEC_SIZE_OUT == 1
+
+#define DIV(x, y) (x / y)
+
+#define AND(x, y) (CONVERT((x && y), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))1))
+#define OR(x, y) (CONVERT((x || y), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))1))
+
+#define OP_FUN_NAME_STR(op) elementwise_operation_##op
+#define OP_FUN_NAME(op) OP_FUN_NAME_STR(op)
+
+#if defined(ACTIVATION_TYPE)
+#include "activation_float_helpers.h"
+#endif // defined(ACTIVATION_TYPE)
+
+/** This function executes an element-wise operation among two tensors.
+ *
+ * @note Vector sizes of inputs and output have to be passed at compile time using -DVEC_SIZE_IN1, -DVEC_SIZE_IN2, -DVEC_SIZE_OUT.
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_OUT=3. It is defined as the remainder between the input's first dimension and VEC_SIZE_OUT
+ * @note The input and output data_types need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar
+ * @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ * @note The element-wise operation to be executed has to be passed at compile time using -DOP (e.g., -DOP=ADD)
+ *
+ * @param[in]  in1_ptr                           Pointer to the source tensor. Supported data types: U8/S16/F16/F32
+ * @param[in]  in1_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  in2_ptr                           Pointer to the source tensor. Supported data types: U8/S16/F16/F32
+ * @param[in]  in2_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8 (only if both inputs are U8), S16/F16/F32
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void OP_FUN_NAME(OP)(
+    TENSOR3D_DECLARATION(in1),
+    TENSOR3D_DECLARATION(in2)
+#if !defined(IN_PLACE)
+    ,
+    TENSOR3D_DECLARATION(out)
+#endif // !defined(IN_PLACE)
+)
+{
+#if VEC_SIZE_IN1 == 1
+    uint in1_x_offs = 0;
+#else  // VEC_SIZE_IN1 == 1
+    uint in1_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN1 - (VEC_SIZE_IN1 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN1), 0);
+#endif // VEC_SIZE_IN1 == 1
+#if VEC_SIZE_IN2 == 1
+    uint in2_x_offs = 0;
+#else  // VEC_SIZE_IN2 == 1
+    uint in2_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN2 - (VEC_SIZE_IN2 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN2), 0);
+#endif // VEC_SIZE_IN2 == 1
+#if !defined(IN_PLACE)
+    uint out_x_offs = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0);
+#endif // !defined(IN_PLACE)
+
+    // Get pixels pointer
+    __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + in1_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * in1_step_y + get_global_id(2) * in1_step_z;
+    __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + in2_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * in2_step_y + get_global_id(2) * in2_step_z;
+    __global        uchar *
+#if !defined(IN_PLACE)
+    out_addr = out_ptr + out_offset_first_element_in_bytes + out_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z;
+#else // !defined(IN_PLACE)
+#if defined(SRC1_IN_PLACE)
+    out_addr    = in1_addr;
+#else  //defined(SRC1_IN_PLACE)
+    out_addr = in2_addr;
+#endif //defined(SRC1_IN_PLACE)
+#endif // !defined(IN_PLACE)
+
+    // Load values
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)
+    in_a = CONVERT((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE *)in1_addr)), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)
+    in_b = CONVERT((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE *)in2_addr)), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT));
+
+    // Calculate and store result
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)
+    res0 = OP(in_a, in_b);
+#if defined(ACTIVATION_TYPE)
+    res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE_OUT, res0, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    STORE_VECTOR_SELECT(res, DATA_TYPE, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif /* defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(DATA_TYPE) */
diff --git a/src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl b/src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl
new file mode 100644
index 0000000000..a11be80875
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define SUB(x, y) (x - y)
+#define ADD(x, y) (x + y)
+#define MAX(x, y) max((x), (y))
+#define MIN(x, y) min((x), (y))
+#define SQUARED_DIFF(x, y) (x - y) * (x - y)
+#define PRELU(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(float, VEC_SIZE_OUT))))
+#define DIV(x, y) (x / y)
+
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+
+#define OP_FUN_NAME_STR(op) elementwise_operation_##op##_quantized
+#define OP_FUN_NAME(op) OP_FUN_NAME_STR(op)
+
+#if defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE)
+
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE_OUT)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE_OUT)
+#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)
+
+/** This function executes an element-wise operation among two tensors.
+ *
+ * @note Vector sizes of inputs and output have to be passed at compile time using -DVEC_SIZE_IN1, -DVEC_SIZE_IN2, -DVEC_SIZE_OUT.
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note In case of broadcasting along the X dimension the proper preprocessor argument should be passed depending on the input (e.g. -DIS_IN1_X_BROADCASTING, -DIS_IN2_X_BROADCASTING)
+ * @note The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, i.e. -DOFFSET_IN1=10
+ * @note The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, i.e. -DOFFSET_IN2=10
+ * @note The quantization offset of the output must be passed at compile time using -DOFFSET_OUT, i.e. -DOFFSET_OUT=10
+ * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, i.e. -DSCALE_IN1=10
+ * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, i.e. -DSCALE_IN2=10
+ * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, i.e. -DSCALE_OUT=10
+ * @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ * @note The element-wise operation to be executed has to be passed at compile time using -DOP (e.g., -DOP=ADD)
+ * @note For QSYMM16 operations OFFSET_IN1, OFFSET_IN2 and OFFSET_OUT must be set to zero
+ * @note The data type must be passed at compile time using -DDATA_TYPE, i.e. -DDATA_TYPE=uchar
+ *
+ * @param[in]  in1_ptr                           Pointer to the source tensor. Supported data types: QASYMM8/QSYMM16
+ * @param[in]  in1_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in1_step_z                        in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  in2_ptr                           Pointer to the source tensor. Supported data types: same as @p in1_ptr
+ * @param[in]  in2_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in2_step_z                        in2_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: same as @p in1_ptr
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void OP_FUN_NAME(OP)(
+    TENSOR3D_DECLARATION(in1),
+    TENSOR3D_DECLARATION(in2)
+#if !defined(IN_PLACE)
+    ,
+    TENSOR3D_DECLARATION(out)
+#endif // !defined(IN_PLACE)
+)
+{
+#if VEC_SIZE_IN1 == 1
+    uint in1_x_offs = 0;
+#else  // VEC_SIZE_IN1 == 1
+    uint in1_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN1 - (VEC_SIZE_IN1 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN1), 0);
+#endif // VEC_SIZE_IN1 == 1
+#if VEC_SIZE_IN2 == 1
+    uint in2_x_offs = 0;
+#else  // VEC_SIZE_IN2 == 1
+    uint in2_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN2 - (VEC_SIZE_IN2 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN2), 0);
+#endif // VEC_SIZE_IN2 == 1
+#if !defined(IN_PLACE)
+    uint out_x_offs = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0);
+#endif // !defined(IN_PLACE)
+
+    // Get pixels pointer
+    __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + in1_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * in1_step_y + get_global_id(2) * in1_step_z;
+    __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + in2_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * in2_step_y + get_global_id(2) * in2_step_z;
+    __global        uchar *
+#if !defined(IN_PLACE)
+    out_addr = out_ptr + out_offset_first_element_in_bytes + out_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z;
+#else // !defined(IN_PLACE)
+#if defined(SRC1_IN_PLACE)
+    out_addr    = in1_addr;
+#else  //defined(SRC1_IN_PLACE)
+    out_addr = in2_addr;
+#endif //defined(SRC1_IN_PLACE)
+#endif // !defined(IN_PLACE)
+
+    VEC_INT in_a = CONVERT((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE *)in1_addr)), VEC_INT);
+    VEC_INT in_b = CONVERT((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE *)in2_addr)), VEC_INT);
+
+    in_a = SUB(in_a, (VEC_INT)((int)OFFSET_IN1));
+    in_b = SUB(in_b, (VEC_INT)((int)OFFSET_IN2));
+
+    const VEC_FLOAT in1f32  = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
+    const VEC_FLOAT in2f32  = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
+    const VEC_FLOAT qresf32 = OP(in1f32, in2f32) / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFFSET_OUT));
+    const VEC_TYPE  res0    = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_TYPE);
+
+    // Store result
+    STORE_VECTOR_SELECT(res, DATA_TYPE, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif /* defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE) */
diff --git a/src/core/CL/cl_kernels/common/elementwise_unary.cl b/src/core/CL/cl_kernels/common/elementwise_unary.cl
new file mode 100644
index 0000000000..81835108a3
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/elementwise_unary.cl
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(OPERATION)
+
+// Calculate exponential
+#define exp_op(input) exp(input)
+// Calculate reverse square root
+#define rsqrt_op(input) rsqrt(input)
+// Calculate negative
+#define neg_op(input) (-input)
+// Calculate sine
+#define sin_op(input) sin(input)
+// Calculate abs for floating point values
+#define fabs_op(input) fabs(input)
+// Calculate natural_log
+#define natural_log_op(input) log(input)
+// Calculate round using round to nearest even rounding mode
+#define round_op(input) rint(input)
+
+#if defined(VEC_SIZE)
+#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define logical_not_op(input) CONVERT(CONVERT(!input, VEC_TYPE) & ((VEC_TYPE)0x1), VEC_TYPE)
+#else // defined(VEC_SIZE)
+#define logical_not_op(input) ((!input) & 0x1)
+#endif // defined(VEC_SIZE)
+
+/** Applies element wise unary operator in a tensor.
+ *
+ * @param[in]  in_ptr                            Pointer to the source image. Supported data types: F16/32.
+ * @param[in]  in_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in_step_z                         in_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  Offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: F16/32.
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_step_y                        Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes Offset of the first element in the destination image
+ */
+__kernel void elementwise_unary(
+    TENSOR3D_DECLARATION(in),
+    TENSOR3D_DECLARATION(out))
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(in);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    in.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * in_stride_x;
+    out.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * out_stride_x;
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+
+    VSTORE(VEC_SIZE)
+    (OPERATION(data), 0, (__global DATA_TYPE *)out.ptr);
+#else  // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+    *((__global DATA_TYPE *)(out.ptr)) = (DATA_TYPE)(OPERATION(*((__global DATA_TYPE *)in.ptr)));
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+}
+#endif // defined(DATA_TYPE) && defined(OPERATION)
diff --git a/src/core/CL/cl_kernels/common/elementwise_unary_quantized.cl b/src/core/CL/cl_kernels/common/elementwise_unary_quantized.cl
new file mode 100644
index 0000000000..2e4cdc53fe
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/elementwise_unary_quantized.cl
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(OPERATION)
+// Calculate reverse square root
+#define rsqrt_op(input) rsqrt(input)
+#if defined(VEC_SIZE)
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#endif // defined(VEC_SIZE)
+
+/** Applies element wise unary operator in a tensor.
+ *
+ * @param[in]  in_ptr                            Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED.
+ * @param[in]  in_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  in_step_z                         in_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  Offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: QASYMM8/QASYMM8_SIGNED.
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_step_y                        Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes Offset of the first element in the destination image
+ */
+__kernel void elementwise_unary_quantized(
+    TENSOR3D_DECLARATION(in),
+    TENSOR3D_DECLARATION(out))
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(in);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    in.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * in_stride_x;
+    out.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * out_stride_x;
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+    VEC_DATA_TYPE(float, VEC_SIZE)
+    data_f32                = CONVERT(data, VEC_FLOAT);
+    data_f32                = (data_f32 - (float)OFFSET_IN) * (float)SCALE_IN;
+    VEC_INT        qres_int = CONVERT_SAT((OPERATION(data_f32) / ((VEC_FLOAT)(float)SCALE_OUT)), VEC_INT) + ((VEC_INT)((int)OFFSET_OUT));
+    const VEC_TYPE qres     = CONVERT_SAT(qres_int, VEC_TYPE);
+    VSTORE(VEC_SIZE)
+    (qres, 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(OPERATION)
diff --git a/src/core/CL/cl_kernels/common/fft.cl b/src/core/CL/cl_kernels/common/fft.cl
new file mode 100644
index 0000000000..3f26d0f1a6
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/fft.cl
@@ -0,0 +1,1880 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE)
+/** Calculates and applies the twiddle factor to a given input.
+ *
+ * @param[in]     phi   The angle.
+ * @param[in,out] input The input on which the factor should be applied.
+ */
+#define TWIDDLE_FACTOR_MULTIPLICATION(phi, input)  \
+    {                                              \
+        VEC_DATA_TYPE(DATA_TYPE, 2)                \
+        w, tmp;                                    \
+        w.x   = cos(phi);                          \
+        w.y   = sin(phi);                          \
+        tmp.x = (w.x * input.x) - (w.y * input.y); \
+        tmp.y = (w.x * input.y) + (w.y * input.x); \
+        input = tmp;                               \
+    }
+
+/** Computes radix-2 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ */
+#define DFT_2(c0, c1)               \
+    {                               \
+        VEC_DATA_TYPE(DATA_TYPE, 2) \
+        v0;                         \
+        v0 = c0;                    \
+        c0 = v0 + c1;               \
+        c1 = v0 - c1;               \
+    }
+
+// radix-3 butterfly unit factors
+#define SQRT3DIV2 0.86602540378443f
+
+/** Computes radix-3 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ * @param[in,out] c2 Complex input 2.
+ */
+#define DFT_3(c0, c1, c2)                             \
+    {                                                 \
+        VEC_DATA_TYPE(DATA_TYPE, 2)                   \
+        v0 = c1 + c2;                                 \
+        VEC_DATA_TYPE(DATA_TYPE, 2)                   \
+        v1   = c1 - c2;                               \
+        c1.x = c0.x - 0.5f * v0.x + v1.y * SQRT3DIV2; \
+        c1.y = c0.y - 0.5f * v0.y - v1.x * SQRT3DIV2; \
+        c2.x = c0.x - 0.5f * v0.x - v1.y * SQRT3DIV2; \
+        c2.y = c0.y - 0.5f * v0.y + v1.x * SQRT3DIV2; \
+        c0   = c0 + v0;                               \
+    }
+
+/**Computes radix-4 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ * @param[in,out] c2 Complex input 2.
+ * @param[in,out] c3 Complex input 3.
+ */
+#define DFT_4(c0, c1, c2, c3)       \
+    {                               \
+        VEC_DATA_TYPE(DATA_TYPE, 2) \
+        v0, v1, v2, v3;             \
+        v0   = c0 + c2;             \
+        v1   = c1 + c3;             \
+        v2   = c0 - c2;             \
+        v3.x = c1.y - c3.y;         \
+        v3.y = c3.x - c1.x;         \
+        c0   = v0 + v1;             \
+        c2   = v0 - v1;             \
+        c1   = v2 + v3;             \
+        c3   = v2 - v3;             \
+    }
+
+// radix-5 butterfly unit factors
+#define W5_A (DATA_TYPE)0.30901699437494f
+#define W5_B (DATA_TYPE)0.95105651629515f
+#define W5_C (DATA_TYPE)0.80901699437494f
+#define W5_D (DATA_TYPE)0.58778525229247f
+
+/** Computes radix-5 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ * @param[in,out] c2 Complex input 2.
+ * @param[in,out] c3 Complex input 3.
+ * @param[in,out] c4 Complex input 4.
+ */
+#define DFT_5(c0, c1, c2, c3, c4)                                  \
+    {                                                              \
+        VEC_DATA_TYPE(DATA_TYPE, 2)                                \
+        v0, v1, v2, v3, v4;                                        \
+        v0 = c0;                                                   \
+        v1 = W5_A * (c1 + c4) - W5_C * (c2 + c3);                  \
+        v2 = W5_C * (c1 + c4) - W5_A * (c2 + c3);                  \
+        v3 = W5_D * (c1 - c4) - W5_B * (c2 - c3);                  \
+        v4 = W5_B * (c1 - c4) + W5_D * (c2 - c3);                  \
+        c0 = v0 + c1 + c2 + c3 + c4;                               \
+        c1 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v4.y, -v4.x); \
+        c2 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v3.y, -v3.x); \
+        c3 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v3.y, v3.x); \
+        c4 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v4.y, v4.x); \
+    }
+
+// radix-7 butterfly unit factors
+#define W7_A (DATA_TYPE)0.62348980185873f
+#define W7_B (DATA_TYPE)0.78183148246802f
+#define W7_C (DATA_TYPE)0.22252093395631f
+#define W7_D (DATA_TYPE)0.97492791218182f
+#define W7_E (DATA_TYPE)0.90096886790241f
+#define W7_F (DATA_TYPE)0.43388373911755f
+
+/** Computes radix-7 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ * @param[in,out] c2 Complex input 2.
+ * @param[in,out] c3 Complex input 3.
+ * @param[in,out] c4 Complex input 4.
+ * @param[in,out] c5 Complex input 5.
+ * @param[in,out] c6 Complex input 6.
+ */
+#define DFT_7(c0, c1, c2, c3, c4, c5, c6)                            \
+    {                                                                \
+        VEC_DATA_TYPE(DATA_TYPE, 2)                                  \
+        v0, v1, v2, v3, v4, v5, v6;                                  \
+        v0 = c0;                                                     \
+        v1 = W7_A * (c1 + c6) - W7_C * (c2 + c5) - W7_E * (c3 + c4); \
+        v2 = W7_C * (c1 + c6) + W7_E * (c2 + c5) - W7_A * (c3 + c4); \
+        v3 = W7_E * (c1 + c6) - W7_A * (c2 + c5) + W7_C * (c3 + c4); \
+        v4 = W7_B * (c1 - c6) + W7_D * (c2 - c5) + W7_F * (c3 - c4); \
+        v5 = W7_D * (c1 - c6) - W7_F * (c2 - c5) - W7_B * (c3 - c4); \
+        v6 = W7_F * (c1 - c6) - W7_B * (c2 - c5) + W7_D * (c3 - c4); \
+        c0 = v0 + c1 + c2 + c3 + c4 + c5 + c6;                       \
+        c1 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v4.y, -v4.x);   \
+        c2 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v5.y, -v5.x);   \
+        c3 = v0 - v3 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v6.y, -v6.x);   \
+        c4 = v0 - v3 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v6.y, v6.x);   \
+        c5 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v5.y, v5.x);   \
+        c6 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v4.y, v4.x);   \
+    }
+
+/** Computes radix-8 butterfly unit.
+ *
+ * @param[in,out] c0 Complex input 0.
+ * @param[in,out] c1 Complex input 1.
+ * @param[in,out] c2 Complex input 2.
+ * @param[in,out] c3 Complex input 3.
+ * @param[in,out] c4 Complex input 4.
+ * @param[in,out] c5 Complex input 5.
+ * @param[in,out] c6 Complex input 6.
+ * @param[in,out] c7 Complex input 7.
+ */
+#define DFT_8(c0, c1, c2, c3, c4, c5, c6, c7) \
+    {                                         \
+        VEC_DATA_TYPE(DATA_TYPE, 2)           \
+        v0, v1, v2, v3, v4, v5, v6, v7;       \
+        VEC_DATA_TYPE(DATA_TYPE, 2)           \
+        s0, s1, s2, s3, s4, s5, s6, s7;       \
+        VEC_DATA_TYPE(DATA_TYPE, 2)           \
+        t0, t1, t2;                           \
+        v0   = c0 + c4;                       \
+        v1   = c1 + c5;                       \
+        v2   = c2 + c6;                       \
+        v3   = c3 + c7;                       \
+        v4   = c0 - c4;                       \
+        v5   = c1 - c5;                       \
+        v6   = c2 - c6;                       \
+        v7   = c3 - c7;                       \
+        s0   = v0 + v2;                       \
+        s1   = v1 + v3;                       \
+        s2   = v0 - v2;                       \
+        s3   = v1 - v3;                       \
+        s4.x = v4.x - v6.y;                   \
+        s4.y = v4.y + v6.x;                   \
+        s5.x = v5.x - v7.y;                   \
+        s5.y = v5.y + v7.x;                   \
+        s6.x = v4.x + v6.y;                   \
+        s6.y = v4.y - v6.x;                   \
+        s7.x = v5.x + v7.y;                   \
+        s7.y = v5.y - v7.x;                   \
+        t0.x = -s3.y;                         \
+        t0.y = s3.x;                          \
+        t1.x = M_SQRT1_2_F * (s5.x - s5.y);   \
+        t1.y = M_SQRT1_2_F * (s5.x + s5.y);   \
+        t2.x = -M_SQRT1_2_F * (s7.x + s7.y);  \
+        t2.y = M_SQRT1_2_F * (s7.x - s7.y);   \
+        c0   = s0 + s1;                       \
+        c1   = s6 - t2;                       \
+        c2   = s2 - t0;                       \
+        c3   = s4 - t1;                       \
+        c4   = s0 - s1;                       \
+        c5   = s6 + t2;                       \
+        c6   = s2 + t0;                       \
+        c7   = s4 + t1;                       \
+    }
+
+/** Computes the first stage of a radix-2 DFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/f32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void fft_radix_2_first_stage_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load two complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    data = vload4(0, (__global DATA_TYPE *)input.ptr);
+
+    // Compute DFT N = 2
+    DFT_2(data.s01, data.s23);
+
+    // Store two complex output values
+    vstore4(data, 0, (__global DATA_TYPE *)output.ptr);
+}
+
+/** Computes the first stage of a radix-2 DFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/f32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void fft_radix_2_first_stage_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load two complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data1 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+
+    // Compute DFT N = 2
+    DFT_2(data1, data2);
+
+    // Store two complex output values
+    vstore2(data1, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
+}
+
+/** Computes the first stage of a radix-3 DFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/f32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void fft_radix_3_first_stage_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load three complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    data0 = vload4(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2, 0, 0));
+
+    // Compute DFT N = 3
+    DFT_3(data0.s01, data0.s23, data1.s01);
+
+    // Store three complex output values
+    vstore4(data0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2, 0, 0));
+}
+
+/** Computes the first stage of a radix-3 DFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/f32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void fft_radix_3_first_stage_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load three complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
+
+    // Compute DFT N = 3
+    DFT_3(data0, data1, data2);
+
+    // Store three complex output values
+    vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
+    vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0));
+}
+
+/** Computes the first stage of a radix-4 DFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/f32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void fft_radix_4_first_stage_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load four complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    data = vload8(0, (__global DATA_TYPE *)input.ptr);
+
+    // Compute DFT N = 4
+    DFT_4(data.s01, data.s23, data.s45, data.s67);
+
+    // Store four complex output values
+    vstore8(data, 0, (__global DATA_TYPE *)output.ptr);
+}
+
+/** Computes the first stage of a radix-4 DFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/f32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void fft_radix_4_first_stage_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load four complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0));
+
+    // Compute DFT N = 4
+    DFT_4(data0, data1, data2, data3);
+
+    // Store four complex output values
+    vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
+    vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0));
+    vstore2(data3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3, 0));
+}
+
+/** Computes the first stage of a radix-5 DFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/f32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void fft_radix_5_first_stage_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load five complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    data0 = vload8(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4, 0, 0));
+
+    // Compute DFT N = 5
+    DFT_5(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01);
+
+    // Store five complex output values
+    vstore8(data0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4, 0, 0));
+}
+
+/** Computes the first stage of a radix-5 DFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/f32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void fft_radix_5_first_stage_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load five complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0));
+
+    // Compute DFT N = 5
+    DFT_5(data0, data1, data2, data3, data4);
+
+    // Store five complex output values
+    vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
+    vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0));
+    vstore2(data3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3, 0));
+    vstore2(data4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4, 0));
+}
+
+/** Computes the first stage of a radix-7 DFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/f32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void fft_radix_7_first_stage_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load seven complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    data0 = vload8(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    data1 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 6, 0, 0));
+
+    // Compute DFT N = 7
+    DFT_7(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01, data1.s23, data2.s01);
+
+    // Store seven complex output values
+    vstore8(data0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore4(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4, 0, 0));
+    vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 6, 0, 0));
+}
+
+/** Computes the first stage of a radix-7 DFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/f32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void fft_radix_7_first_stage_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load seven complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6, 0));
+
+    // Compute DFT N = 7
+    DFT_7(data0, data1, data2, data3, data4, data5, data6);
+
+    // Store seven complex output values
+    vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
+    vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0));
+    vstore2(data3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3, 0));
+    vstore2(data4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4, 0));
+    vstore2(data5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 5, 0));
+    vstore2(data6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 6, 0));
+}
+
+/** Computes the first stage of a radix-8 DFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/f32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void fft_radix_8_first_stage_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load eight complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    data = vload16(0, (__global DATA_TYPE *)input.ptr);
+
+    // Compute DFT N = 8
+    DFT_8(data.s01, data.s23, data.s45, data.s67, data.s89, data.sAB, data.sCD, data.sEF);
+
+    // Store eight complex output values
+    vstore16(data, 0, (__global DATA_TYPE *)output.ptr);
+}
+
+/** Computes the first stage of a radix-8 DFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/f32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void fft_radix_8_first_stage_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load eight complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data7 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 7, 0));
+
+    // Compute DFT N = 8
+    DFT_8(data0, data1, data2, data3, data4, data5, data6, data7);
+
+    // Store eight complex output values
+    vstore2(data0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0));
+    vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0));
+    vstore2(data3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3, 0));
+    vstore2(data4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4, 0));
+    vstore2(data5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 5, 0));
+    vstore2(data6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 6, 0));
+    vstore2(data7, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 7, 0));
+}
+
+/** Computes a stage of a radix-2 FFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/f32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Products of radix order of previous radix's stage
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+__kernel void fft_radix_2_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-2
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load two complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
+
+    // Compute phi
+    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+
+    // Compute DFT N = 2
+    DFT_2(c0, c1);
+
+    // Store two complex output values
+    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-2 FFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/f32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Products of radix order of previous radix's stage
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+__kernel void fft_radix_2_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-2
+    uint kx = get_global_id(1);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load two complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
+
+    // Compute phi
+    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+
+    // Compute DFT N = 2
+    DFT_2(c0, c1);
+
+    // Store two complex output values
+    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
+}
+
+/** Computes a stage of a radix-3 FFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/f32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Products of radix order of previous radix's stage
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+__kernel void fft_radix_3_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-3
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load three complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+
+    // Compute phi
+    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+
+    // Compute DFT N = 3
+    DFT_3(c0, c1, c2);
+
+    // Store three complex output values
+    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
+    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-3 FFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/f32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Products of radix order of previous radix's stage
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+__kernel void fft_radix_3_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-3
+    uint kx = get_global_id(1);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load three complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+
+    // Compute phi
+    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+
+    // Compute DFT N = 3
+    DFT_3(c0, c1, c2);
+
+    // Store three complex output values
+    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
+    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+}
+
+/** Computes a stage of a radix-4 FFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/f32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Products of radix order of previous radix's stage
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+__kernel void fft_radix_4_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-4
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load four complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+
+    // Compute phi
+    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+
+    // Compute DFT N = 4
+    DFT_4(c0, c1, c2, c3);
+
+    // Store four complex output values
+    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
+    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+    vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-4 FFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/f32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Products of radix order of previous radix's stage
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+__kernel void fft_radix_4_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-4
+    uint kx = get_global_id(1);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load four complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3 * Nx, 0));
+
+    // Compute phi
+    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+
+    // Compute DFT N = 4
+    DFT_4(c0, c1, c2, c3);
+
+    // Store four complex output values
+    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
+    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+    vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3 * Nx, 0));
+}
+
+/** Computes a stage of a radix-5 FFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/f32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Products of radix order of previous radix's stage
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+__kernel void fft_radix_5_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-5
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load five complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4 * Nx, 0, 0));
+
+    // Compute phi
+    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+
+    // Compute DFT N = 5
+    DFT_5(c0, c1, c2, c3, c4);
+
+    // Store five complex output values
+    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
+    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+    vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+    vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-5 FFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/f32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Products of radix order of previous radix's stage
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+__kernel void fft_radix_5_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-5
+    uint kx = get_global_id(1);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load five complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3 * Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4 * Nx, 0));
+
+    // Compute phi
+    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+
+    // Compute DFT N = 5
+    DFT_5(c0, c1, c2, c3, c4);
+
+    // Store five complex output values
+    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
+    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+    vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3 * Nx, 0));
+    vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4 * Nx, 0));
+}
+
+/** Computes a stage of a radix-7 FFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/f32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Products of radix order of previous radix's stage
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+__kernel void fft_radix_7_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-7
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load seven complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4 * Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 5 * Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 6 * Nx, 0, 0));
+
+    // Compute phi
+    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+    TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
+    TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
+
+    // Compute DFT N = 7
+    DFT_7(c0, c1, c2, c3, c4, c5, c6);
+
+    // Store seven complex output values
+    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
+    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+    vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+    vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4 * Nx, 0, 0));
+    vstore2(c5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 5 * Nx, 0, 0));
+    vstore2(c6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 6 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-7 FFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/f32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Products of radix order of previous radix's stage
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+__kernel void fft_radix_7_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-7
+    uint kx = get_global_id(1);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load seven complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3 * Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4 * Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5 * Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6 * Nx, 0));
+
+    // Compute phi
+    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+    TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
+    TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
+
+    // Compute DFT N = 7
+    DFT_7(c0, c1, c2, c3, c4, c5, c6);
+
+    // Store seven complex output values
+    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
+    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+    vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3 * Nx, 0));
+    vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4 * Nx, 0));
+    vstore2(c5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 5 * Nx, 0));
+    vstore2(c6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 6 * Nx, 0));
+}
+
+/** Computes a stage of a radix-8 FFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/f32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Products of radix order of previous radix's stage
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+__kernel void fft_radix_8_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-8
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load eight complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4 * Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 5 * Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 6 * Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c7 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 7 * Nx, 0, 0));
+
+    // Compute phi
+    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+    TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
+    TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
+    TWIDDLE_FACTOR_MULTIPLICATION(7 * phi, c7);
+
+    // Compute DFT N = 8
+    DFT_8(c0, c1, c2, c3, c4, c5, c6, c7);
+
+    // Store eight complex output values
+    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
+    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+    vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+    vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4 * Nx, 0, 0));
+    vstore2(c5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 5 * Nx, 0, 0));
+    vstore2(c6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 6 * Nx, 0, 0));
+    vstore2(c7, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 7 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-8 FFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/f32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Products of radix order of previous radix's stage
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+__kernel void fft_radix_8_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-8
+    uint kx = get_global_id(1);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else  /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load eight complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3 * Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4 * Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5 * Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6 * Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c7 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 7 * Nx, 0));
+
+    // Compute phi
+    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+    TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
+    TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
+    TWIDDLE_FACTOR_MULTIPLICATION(7 * phi, c7);
+
+    // Compute DFT N = 8
+    DFT_8(c0, c1, c2, c3, c4, c5, c6, c7);
+
+    // Store eight complex output values
+    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
+    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+    vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3 * Nx, 0));
+    vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4 * Nx, 0));
+    vstore2(c5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 5 * Nx, 0));
+    vstore2(c6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 6 * Nx, 0));
+    vstore2(c7, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 7 * Nx, 0));
+}
+#endif // defined(DATA_TYPE)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/fft_digit_reverse.cl b/src/core/CL/cl_kernels/common/fft_digit_reverse.cl
new file mode 100644
index 0000000000..5f64d95bf9
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/fft_digit_reverse.cl
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE)
+/** Computes the digit reverse stage on axis X
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  idx_ptr                           Pointer to the index tensor. Supported data types: U32
+ * @param[in]  idx_stride_x                      Stride of the index tensor in X dimension (in bytes)
+ * @param[in]  idx_step_x                        idx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  idx_offset_first_element_in_bytes The offset of the first element in the index tensor
+ */
+__kernel void fft_digit_reverse_axis_0(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    VECTOR_DECLARATION(idx))
+{
+    // Get tensor pointers
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+    Vector   idx = CONVERT_TO_VECTOR_STRUCT(idx);
+
+    const unsigned int iidx = *((__global uint *)(idx.ptr));
+
+    // Load data
+#if VEC_SIZE == 1
+    DATA_TYPE data = *((__global DATA_TYPE *)tensor3D_offset(&src, iidx, get_global_id(1), get_global_id(2)));
+#elif VEC_SIZE == 2
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&src, iidx, get_global_id(1), get_global_id(2)));
+#else // VEC_SIZE == 1
+#error "vec_size of 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+
+    // Create result
+#if VEC_SIZE == 1
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    res = { data, 0 };
+#elif VEC_SIZE == 2
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    res = data;
+#else // VEC_SIZE == 1
+#error "vec_size of 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+
+    // Store result
+#if defined(CONJ)
+    vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(res.s0, -res.s1), 0, (__global DATA_TYPE *)dst.ptr);
+#else  // defined(CONJ)
+    vstore2(res, 0, (__global DATA_TYPE *)dst.ptr);
+#endif // defined(CONJ)
+}
+
+/** Computes the digit reverse stage on axis Y
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  idx_ptr                           Pointer to the index tensor. Supported data types: U32
+ * @param[in]  idx_stride_x                      Stride of the index tensor in X dimension (in bytes)
+ * @param[in]  idx_step_x                        idx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  idx_offset_first_element_in_bytes The offset of the first element in the index tensor
+ */
+__kernel void fft_digit_reverse_axis_1(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    VECTOR_DECLARATION(idx))
+{
+    // Get tensor pointers
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+    Vector   idx = CONVERT_TO_VECTOR_STRUCT_NO_STEP(idx);
+
+    const unsigned int iidx = *((__global uint *)vector_offset(&idx, (int)(get_global_id(1))));
+
+    // Load data
+#if VEC_SIZE == 1
+    DATA_TYPE data = *((__global DATA_TYPE *)tensor3D_offset(&src, get_global_id(0), iidx, get_global_id(2)));
+#elif VEC_SIZE == 2
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&src, get_global_id(0), iidx, get_global_id(2)));
+#else // VEC_SIZE == 1
+#error "vec_size of 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+
+    // Create result
+#if VEC_SIZE == 1
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    res = { data, 0 };
+#elif VEC_SIZE == 2
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    res = data;
+#else // VEC_SIZE == 1
+#error "vec_size of 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+
+    // Store result
+#if defined(CONJ)
+    vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(res.s0, -res.s1), 0, (__global DATA_TYPE *)dst.ptr);
+#else  // defined(CONJ)
+    vstore2(res, 0, (__global DATA_TYPE *)dst.ptr);
+#endif // defined(CONJ)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/fft_scale.cl b/src/core/CL/cl_kernels/common/fft_scale.cl
new file mode 100644
index 0000000000..c799dd3b9e
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/fft_scale.cl
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE)
+/** Computes the fft scale stage
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           (Optional) Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        (Optional) dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        (Optional) dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      (Optional) Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        (Optional) dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ * @param[in]  scale                             Scale to apply to the complex value
+ */
+__kernel void fft_scale_conj(
+    TENSOR3D_DECLARATION(src)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(dst)
+#endif /* not IN_PLACE */
+    ,
+    float scale)
+{
+    // Get tensor pointers
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+#if defined(IN_PLACE)
+    Tensor3D dst = src;
+#else  /* IN_PLACE */
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+#endif /* IN_PLACE */
+
+    // Store result
+#if VEC_SIZE == 1
+    *((__global DATA_TYPE *)dst.ptr) = (*(__global DATA_TYPE *)src.ptr) / (DATA_TYPE)scale;
+#elif VEC_SIZE == 2
+    // Load data
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data = vload2(0, (__global DATA_TYPE *)src.ptr);
+    data /= (DATA_TYPE)scale;
+#if defined(CONJ)
+    vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(data.s0, -data.s1), 0, (__global DATA_TYPE *)dst.ptr);
+#else  // defined(CONJ)
+    vstore2(data, 0, (__global DATA_TYPE *)dst.ptr);
+#endif // defined(CONJ)
+#else  // VEC_SIZE == 1
+#error "vec_size of 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/fill_border.cl b/src/core/CL/cl_kernels/common/fill_border.cl
new file mode 100644
index 0000000000..a43343c9f4
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/fill_border.cl
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Fill N pixel of the padding edge of a single channel image by replicating the closest valid pixel.
+ *
+ * @attention  The DATA_TYPE needs to be passed at the compile time.
+ * e.g. -DDATA_TYPE=int
+ *
+ * @attention  The border size for top, bottom, left, right needs to be passed at the compile time.
+ * e.g. --DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2
+ *
+ * @param[in,out] buf_ptr                           Pointer to the source image. Supported data types: All
+ * @param[in]     buf_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]     buf_step_x                        buf_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     buf_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]     buf_step_y                        buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     buf_stride_z                      Stride between images if batching images (in bytes)
+ * @param[in]     buf_step_z                        buf_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     buf_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]     width                             Width of the valid region of the image
+ * @param[in]     height                            Height of the valid region of the image
+ * @param[in]     start_pos                         XY coordinate indicating the start point of the valid region
+ */
+__kernel void fill_image_borders_replicate(
+    TENSOR3D_DECLARATION(buf),
+    uint width,
+    uint height,
+    int2 start_pos)
+{
+    Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(buf);
+
+    // Update pointer to point to the starting point of the valid region
+    buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x;
+
+    const int total_width = BORDER_SIZE_LEFT + width + BORDER_SIZE_RIGHT;
+    const int gid0        = get_global_id(0);
+    const int gidH        = gid0 - total_width;
+    const int gidW        = gid0 - BORDER_SIZE_LEFT;
+
+    if(gidH >= 0)
+    {
+        // Handle left border
+        DATA_TYPE left_val = *(__global DATA_TYPE *)offset(&buf, 0, gidH);
+        for(int i = -BORDER_SIZE_LEFT; i < 0; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, i, gidH) = left_val;
+        }
+        // Handle right border
+        DATA_TYPE right_val = *(__global DATA_TYPE *)offset(&buf, width - 1, gidH);
+        for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, width + i, gidH) = right_val;
+        }
+    }
+    else
+    {
+        // Get value for corners
+        int val_idx = gidW;
+        if(gidW < 0 || gidW > (width - 1))
+        {
+            val_idx = gidW < 0 ? 0 : width - 1;
+        }
+
+        // Handle top border
+        DATA_TYPE top_val = *(__global DATA_TYPE *)offset(&buf, val_idx, 0);
+        for(int i = -BORDER_SIZE_TOP; i < 0; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, gidW, i) = top_val;
+        }
+        // Handle bottom border
+        DATA_TYPE bottom_val = *(__global DATA_TYPE *)offset(&buf, val_idx, height - 1);
+        for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, gidW, height + i) = bottom_val;
+        }
+    }
+}
+
+/** Fill N pixels of the padding edge of a single channel image with a constant value.
+ *
+ * @attention  The DATA_TYPE needs to be passed at the compile time.
+ * e.g. -DDATA_TYPE=int
+ *
+ * @attention  The border size for top, bottom, left, right needs to be passed at the compile time.
+ * e.g. --DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2
+ *
+ * @param[out] buf_ptr                           Pointer to the source image. Supported data types: All
+ * @param[in]  buf_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  buf_step_x                        buf_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  buf_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  buf_step_y                        buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  buf_stride_z                      Stride between images if batching images (in bytes)
+ * @param[in]  buf_step_z                        buf_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  buf_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  width                             Width of the valid region of the image
+ * @param[in]  height                            Height of the valid region of the image
+ * @param[in]  start_pos                         XY coordinate indicating the start point of the valid region
+ * @param[in]  constant_value                    Constant value to use to fill the edges
+ */
+__kernel void fill_image_borders_constant(
+    TENSOR3D_DECLARATION(buf),
+    uint      width,
+    uint      height,
+    int2      start_pos,
+    DATA_TYPE constant_value)
+{
+    Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(buf);
+
+    // Update pointer to point to the starting point of the valid region
+    buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x;
+
+    const int total_width = BORDER_SIZE_LEFT + width + BORDER_SIZE_RIGHT;
+    const int gid0        = get_global_id(0);
+    const int gidH        = gid0 - total_width;
+    const int gidW        = gid0 - BORDER_SIZE_LEFT;
+
+    if(gidH >= 0)
+    {
+        // Handle left border
+        for(int i = -BORDER_SIZE_LEFT; i < 0; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, i, gidH) = constant_value;
+        }
+        // Handle right border
+        for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, width + i, gidH) = constant_value;
+        }
+    }
+    else
+    {
+        // Handle top border
+        for(int i = -BORDER_SIZE_TOP; i < 0; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, gidW, i) = constant_value;
+        }
+        // Handle bottom border
+        for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, gidW, height + i) = constant_value;
+        }
+    }
+}
diff --git a/src/core/CL/cl_kernels/common/floor.cl b/src/core/CL/cl_kernels/common/floor.cl
new file mode 100644
index 0000000000..f6dd4edd2e
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/floor.cl
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
+
+/** Perform a floor operation on an input tensor.
+ *
+ * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note The leftover size in the X dimension shoud be given as preprocessor argument using -DVEC_SIZE_LEFTOVER is; x_dimension % VEC_SIZE. e.g. -DVEC_SIZE_LEFTOVER=1
+ * @note Can only take floating point data types.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void floor_layer(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    // Offset computation
+    const uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+
+    // Address computation
+    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data0 = floor(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr));
+
+    STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/gather.cl b/src/core/CL/cl_kernels/common/gather.cl
new file mode 100644
index 0000000000..e16f4bf315
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/gather.cl
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(AXIS)
+
+/** Performs the Gather operation along the chosen axis
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1
+ * @attention Output tensor depth should be given as a preprocessor argument using -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16
+ *
+ *
+ * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: All
+ * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  input_stride_z                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per work item (in bytes)
+ * @param[in]  input_stride_w                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_w                          input_stride_w * number of elements along W processed per work item (in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   Offset of the first element in the source tensor
+ * @param[in]  indices_ptr                           Pointer to the indices vector. Supported data types: S32/U32.
+ * @param[in]  indices_stride_x                      Stride of the indices vector in X dimension (in bytes)
+ * @param[in]  indices_step_x                        input_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  indices_offset_first_element_in_bytes Offset of the first element in the indices vector
+ * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per work item (in bytes)
+ * @param[in]  output_stride_w                       Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                         output_stride_w * number of elements along W processed per work item (in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  Offset of the first element in the destination tensor
+ */
+__kernel void gather(
+    TENSOR4D_DECLARATION(input),
+    TENSOR4D_DECLARATION(indices),
+    TENSOR4D_DECLARATION(output))
+{
+    const int px = get_global_id(0);
+    const int py = get_global_id(1);
+    const int pz = get_global_id(2) % OUTPUT_DIM_Z;
+    const int pw = (get_global_id(2) / OUTPUT_DIM_Z );
+
+    const Tensor4D input   = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+    const Tensor4D indices = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(indices);
+    Tensor4D       output  = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z);
+
+#if AXIS == 0
+#if INDICES_DIMS == 1
+    const uint index                 = *(__global const uint *)tensor4D_offset(&indices, px, 0, 0, 0);
+    const uint safe_index            = select((uint)0, index, index < INDEX_LIMIT);
+    __global const uchar *input_addr = tensor4D_offset(&input, safe_index, py, pz, pw);
+#elif INDICES_DIMS == 2
+    const uint index                 = *(__global const uint *)tensor4D_offset(&indices, px, py, 0, 0);
+    const uint safe_index            = select((uint)0, index, index < INDEX_LIMIT);
+    __global const uchar *input_addr = tensor4D_offset(&input, safe_index, pz, pw, 0);
+#elif INDICES_DIMS == 3
+    const uint index                 = *(__global const uint *)tensor4D_offset(&indices, px, py, pz, 0);
+    const uint safe_index            = select((uint)0, index, index < INDEX_LIMIT);
+    __global const uchar *input_addr = tensor4D_offset(&input, safe_index, pw, 0, 0);
+#elif INDICES_DIMS == 4
+    const uint index                 = *(__global const uint *)tensor4D_offset(&indices, px, py, pz, pw);
+    const uint safe_index            = select((uint)0, index, index < INDEX_LIMIT);
+    __global const uchar *input_addr = tensor4D_offset(&input, safe_index, 0, 0, 0);
+#endif //INDICES_DIMS
+
+#elif AXIS == 1
+#if INDICES_DIMS == 1
+    const uint index                 = *(__global const uint *)tensor4D_offset(&indices, py, 0, 0, 0);
+    const uint safe_index            = select((uint)0, index, index < INDEX_LIMIT);
+     __global const uchar *input_addr = tensor4D_offset(&input, px, safe_index, pz, pw);
+#elif INDICES_DIMS == 2
+    const uint index                 = *(__global const uint *)tensor4D_offset(&indices, py, pz, 0, 0);
+    const uint safe_index            = select((uint)0, index, index < INDEX_LIMIT);
+    __global const uchar *input_addr = tensor4D_offset(&input, px, safe_index, pw, 0);
+#elif INDICES_DIMS == 3
+    const uint index                 = *(__global const uint *)tensor4D_offset(&indices, py, pz, pw, 0);
+    const uint safe_index            = select((uint)0, index, index < INDEX_LIMIT);
+    __global const uchar *input_addr = tensor4D_offset(&input, px, safe_index, 0, 0);
+#endif //INDICES_DIMS
+
+#elif AXIS == 2
+#if INDICES_DIMS == 1
+    const uint index                 = *(__global const uint *)tensor4D_offset(&indices, pz, 0, 0, 0);
+    const uint safe_index            = select((uint)0, index, index < INDEX_LIMIT);
+     __global const uchar *input_addr = tensor4D_offset(&input, px, py, safe_index, pw);
+#elif INDICES_DIMS == 2
+    const uint index                 = *(__global const uint *)tensor4D_offset(&indices, pz, pw, 0, 0);
+    const uint safe_index            = select((uint)0, index, index < INDEX_LIMIT);
+    __global const uchar *input_addr = tensor4D_offset(&input, px, py, safe_index, 0);
+#endif //INDICES_DIMS
+
+#elif AXIS == 3
+#if INDICES_DIMS == 1
+    const uint index                 = *(__global const uint *)tensor4D_offset(&indices, pw, 0, 0, 0);
+    const uint safe_index            = select((uint)0, index, index < INDEX_LIMIT);
+     __global const uchar *input_addr = tensor4D_offset(&input, px, py, pz, safe_index);
+#endif //INDICES_DIMS
+
+#endif //AXIS
+
+    *(__global DATA_TYPE *)output.ptr = select((DATA_TYPE)0, *((__global const DATA_TYPE *)input_addr), (DATA_TYPE)(index < INDEX_LIMIT));
+}
+
+#endif //defined(DATA_TYPE) && defined(AXIS)
diff --git a/src/core/CL/cl_kernels/common/gemm.cl b/src/core/CL/cl_kernels/common/gemm.cl
new file mode 100644
index 0000000000..0c30c0e626
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/gemm.cl
@@ -0,0 +1,3594 @@
+/*
+ * Copyright (c) 2017-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "gemm_helpers.h"
+#include "repeat.h"
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE)
+
+#define CONCAT(a, b) a##b
+
+#define ARM_DOT1(a, b, c) \
+    ({                    \
+        c = fma(a, b, c); \
+    })
+#define ARM_DOT2(a, b, c)       \
+    ({                          \
+        c = fma(a.s0, b.s0, c); \
+        c = fma(a.s1, b.s1, c); \
+    })
+#define ARM_DOT3(a, b, c)           \
+    ({                              \
+        ARM_DOT2(a, b, c);          \
+        c = fma((a.s2), (b.s2), c); \
+    })
+#define ARM_DOT4(a, b, c)           \
+    ({                              \
+        ARM_DOT3(a, b, c);          \
+        c = fma((a.s3), (b.s3), c); \
+    })
+#define ARM_DOT8(a, b, c)            \
+    ({                               \
+        ARM_DOT4((a.lo), (b.lo), c); \
+        ARM_DOT4((a.hi), (b.hi), c); \
+    })
+#define ARM_DOT16(a, b, c)           \
+    ({                               \
+        ARM_DOT8((a.lo), (b.lo), c); \
+        ARM_DOT8((a.hi), (b.hi), c); \
+    })
+
+#if N0 == 2
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+    ({                             \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##0), (c.s0));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##1), (c.s1));     \
+    })
+#elif N0 == 3 // N0 == 3
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+    ({                             \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##0), (c.s0));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##1), (c.s1));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##2), (c.s2));     \
+    })
+#elif N0 == 4 // N0 == 4
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+    ({                             \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##0), (c.s0));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##1), (c.s1));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##2), (c.s2));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##3), (c.s3));     \
+    })
+#elif N0 == 8 // N0 == 8
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+    ({                             \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##0), (c.s0));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##1), (c.s1));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##2), (c.s2));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##3), (c.s3));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##4), (c.s4));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##5), (c.s5));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##6), (c.s6));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##7), (c.s7));     \
+    })
+#elif N0 == 16 // N0 == 16
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+    ({                             \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##0), (c.s0));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##1), (c.s1));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##2), (c.s2));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##3), (c.s3));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##4), (c.s4));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##5), (c.s5));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##6), (c.s6));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##7), (c.s7));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##8), (c.s8));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##9), (c.s9));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##A), (c.sA));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##B), (c.sB));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##C), (c.sC));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##D), (c.sD));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##E), (c.sE));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##F), (c.sF));     \
+    })
+#else // N0 not supported
+#error "N0 value not supported"
+#endif // N0 conditions
+
+#if defined(GEMM_MM_RESHAPED_ONLY_RHS_T)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ *  The LHS matrix is NOT reshaped
+ *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
+ *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
+ *  - H0 >= 1
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data type: F16/F32
+ * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
+ * @param[in]  rhs_ptr                            Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_x                       Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  rhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rhs_stride_y                       Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  rhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the RHS reshaped matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in]  M                                  Number of rows in LHS matrix not reshaped.
+ * @param[in]  N                                  Number of columns in RHS matrix not reshaped.
+ * @param[in]  K                                  Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ */
+__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
+                                          IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+                                          IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+                                          IMAGE_DECLARATION(dst),
+                                          uint lhs_stride_z,
+                                          uint rhs_stride_z,
+#if defined(BETA)
+                                          uint bias_stride_z,
+#endif //defined(BETA)
+                                          uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                          ,
+                                          uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                          ,
+                                          uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                          ,
+                                          const int M,
+                                          const int N,
+                                          const int K)
+{
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+    const bool cond_y = y == 0;
+    const bool cond_x = ((x + 1) * N0 >= N);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x * N0 >= N) || (y * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
+
+    // Compute RHS reshaped matrix address
+    uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply lhs_stride_z by DEPTH_GEMM3D
+    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0)    c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+    int i = 0;
+    for(; i <= (K - K0); i += K0)
+    {
+        // Supported cases (M0, K0):
+        // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+        // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+        // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+        // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+        // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+        // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+        // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+        // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+        // Load values from LHS matrix
+        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+        // Load values from RHS reshaped matrix
+        LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
+
+        // Accumulate
+        ARM_DOT_K0XN0(K0, a0, b, c0);
+#if M0 > 1
+        ARM_DOT_K0XN0(K0, a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+        ARM_DOT_K0XN0(K0, a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+        ARM_DOT_K0XN0(K0, a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+        ARM_DOT_K0XN0(K0, a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+        ARM_DOT_K0XN0(K0, a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+        ARM_DOT_K0XN0(K0, a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+        ARM_DOT_K0XN0(K0, a7, b, c7);
+#endif // M0 > 7
+
+        lhs_offset += K0 * sizeof(DATA_TYPE);
+        rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
+    }
+
+    // Left-over accumulations
+    for(; i < K; ++i)
+    {
+        // Load values from LHS matrix
+        LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+        // Load values from RHS reshaped matrix
+        LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
+
+        // Accumulate
+        ARM_DOT_K0XN0(1, a0, b, c0);
+#if M0 > 1
+        ARM_DOT_K0XN0(1, a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+        ARM_DOT_K0XN0(1, a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+        ARM_DOT_K0XN0(1, a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+        ARM_DOT_K0XN0(1, a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+        ARM_DOT_K0XN0(1, a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+        ARM_DOT_K0XN0(1, a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+        ARM_DOT_K0XN0(1, a7, b, c7);
+#endif // M0 > 7
+
+        lhs_offset += sizeof(DATA_TYPE);
+        rhs_offset += sizeof(DATA_TYPE);
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
+
+    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias
+    ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store output block
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+#undef RHS_STEP_LOOP
+}
+#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_T)
+
+#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image
+ *  The LHS matrix is NOT reshaped
+ *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
+ *
+ * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.
+ * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
+ *       Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
+ *       could be different from the value returned by get_image_height(rhs_img).
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 4, 8, 16
+ *  - K0 = 4, 8, 16
+ *  - H0 >= 1
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data type: F32
+ * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
+ * @param[in]  rhs_img                            The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in]  M                                  Number of rows in LHS matrix not reshaped.
+ * @param[in]  N                                  Number of columns in RHS matrix not reshaped.
+ * @param[in]  K                                  Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ */
+__kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
+                                                  __read_only image2d_t rhs_img,
+#if defined(BETA)
+                                                  IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+                                                  IMAGE_DECLARATION(dst),
+                                                  uint lhs_stride_z,
+                                                  uint rhs_stride_z,
+#if defined(BETA)
+                                                  uint bias_stride_z,
+#endif //defined(BETA)
+                                                  uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                                  ,
+                                                  uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                  ,
+                                                  uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                  ,
+                                                  const int M,
+                                                  const int N,
+                                                  const int K)
+{
+    // Pixel unit
+#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
+
+    const uint LEFTOVER_K = K % K0;
+
+    // Block size
+#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (PIXEL_UNIT)
+#define RHS_STEP_X (PIXEL_UNIT * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X PIXEL_UNIT
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+    const bool cond_y = y == 0;
+    const bool cond_x = ((x + 1) * N0 >= N);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x * N0 >= N) || (y * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
+#else  // defined(MATRIX_B_DEPTH)
+    const uint z_rhs = get_global_id(2);
+#endif // defined(MATRIX_B_DEPTH)
+
+    // Compute RHS matrix coordinates
+    uint       x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
+    const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply lhs_stride_z by DEPTH_GEMM3D
+    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);
+
+    int i = 0;
+    for(; i <= (K - K0); i += K0)
+    {
+        // Load values from LHS matrix
+        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+        // Load values from RHS matrix stored in a cl_image
+        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
+        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
+
+        // Accumulate
+        ARM_DOT_K0XN0(K0, a0, b, c0);
+#if M0 > 1
+        ARM_DOT_K0XN0(K0, a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+        ARM_DOT_K0XN0(K0, a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+        ARM_DOT_K0XN0(K0, a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+        ARM_DOT_K0XN0(K0, a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+        ARM_DOT_K0XN0(K0, a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+        ARM_DOT_K0XN0(K0, a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+        ARM_DOT_K0XN0(K0, a7, b, c7);
+#endif // M0 > 7
+
+        lhs_offset += K0 * sizeof(DATA_TYPE);
+        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
+    }
+
+    if(LEFTOVER_K != 0)
+    {
+        // Note: We cannot read out-of-bound elements from the RHS matrix because
+        // the RHS width is always multiple of K0. This is not be true for the LHS matrix
+        // Left-over accumulations for LHS matrix
+
+        union UNION_VEC_TYPE
+        {
+            DATA_TYPE s[K0];
+            VEC_DATA_TYPE(DATA_TYPE, K0)
+            v;
+        };
+
+        union UNION_VEC_TYPE a0 = {.v = 0 };
+#if M0 > 1
+        union UNION_VEC_TYPE a1 = {.v = 0 };
+#endif // M0 > 1
+#if M0 > 2
+        union UNION_VEC_TYPE a2 = {.v = 0 };
+#endif // M0 > 2
+#if M0 > 3
+        union UNION_VEC_TYPE a3 = {.v = 0 };
+#endif // M0 > 3
+#if M0 > 4
+        union UNION_VEC_TYPE a4 = {.v = 0 };
+#endif // M0 > 4
+#if M0 > 5
+        union UNION_VEC_TYPE a5 = {.v = 0 };
+#endif // M0 > 5
+#if M0 > 6
+        union UNION_VEC_TYPE a6 = {.v = 0 };
+#endif // M0 > 6
+#if M0 > 7
+        union UNION_VEC_TYPE a7 = {.v = 0 };
+#endif // M0 > 7
+
+        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
+
+        // Load from RHS matrix
+        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
+
+        // Load from LHS matrix
+        for(int k = 0; k < LEFTOVER_K; ++k)
+        {
+            a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
+#if M0 > 1
+            a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
+#endif // M0 > 1
+#if M0 > 2
+            a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
+#endif // M0 > 2
+#if M0 > 3
+            a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
+#endif // M0 > 3
+#if M0 > 4
+            a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
+#endif // M0 > 4
+#if M0 > 5
+            a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
+#endif // M0 > 5
+#if M0 > 6
+            a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
+#endif // M0 > 6
+#if M0 > 7
+            a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
+#endif // M0 > 7
+
+            lhs_offset += sizeof(DATA_TYPE);
+        }
+
+        // Accumulate
+        ARM_DOT_K0XN0(K0, a0.v, b, c0);
+#if M0 > 1
+        ARM_DOT_K0XN0(K0, a1.v, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+        ARM_DOT_K0XN0(K0, a2.v, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+        ARM_DOT_K0XN0(K0, a3.v, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+        ARM_DOT_K0XN0(K0, a4.v, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+        ARM_DOT_K0XN0(K0, a5.v, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+        ARM_DOT_K0XN0(K0, a6.v, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+        ARM_DOT_K0XN0(K0, a7.v, b, c7);
+#endif // M0 > 7
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
+
+    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias
+    ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store output block
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+#undef RHS_STEP_LOOP
+#undef PIXEL_UNIT
+}
+#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE)
+
+#define VFMA(a, b, c)     \
+    ({                    \
+        c = fma(a, b, c); \
+    })
+
+#if M0 == 1
+#define VFMA_M0xN0(i, a, b, c)                                        \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+    })
+#elif M0 == 2 // M0 == 2
+#define VFMA_M0xN0(i, a, b, c)                                        \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+    })
+#elif M0 == 3 // M0 == 3
+#define VFMA_M0xN0(i, a, b, c)                                        \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+    })
+#elif M0 == 4 // M0 == 4
+#define VFMA_M0xN0(i, a, b, c)                                        \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+    })
+#elif M0 == 5 // M0 == 5
+#define VFMA_M0xN0(i, a, b, c)                                        \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+    })
+#elif M0 == 6 // M0 == 6
+#define VFMA_M0xN0(i, a, b, c)                                        \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+    })
+#elif M0 == 7 // M0 == 7
+#define VFMA_M0xN0(i, a, b, c)                                        \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+    })
+#elif M0 == 8 // M0 == 8
+#define VFMA_M0xN0(i, a, b, c)                                        \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
+    })
+#else // M0 not supported
+#error "M0 not supported"
+#endif // M0 not supported
+
+#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ *  The LHS matrix is NOT reshaped
+ *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
+ *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
+ *  - H0 >= 1
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data type: F16/F32
+ * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
+ * @param[in]  rhs_ptr                            Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_x                       Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  rhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rhs_stride_y                       Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  rhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the RHS reshaped matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in]  M                                  Number of rows in LHS matrix not reshaped.
+ * @param[in]  N                                  Number of columns in RHS matrix not reshaped.
+ * @param[in]  K                                  Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ */
+__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),
+                                           IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+                                           IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+                                           IMAGE_DECLARATION(dst),
+                                           uint lhs_stride_z,
+                                           uint rhs_stride_z,
+#if defined(BETA)
+                                           uint bias_stride_z,
+#endif //defined(BETA)
+                                           uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                           ,
+                                           uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                           ,
+                                           uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                           ,
+                                           const int M,
+                                           const int N,
+                                           const int K)
+{
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (N0)
+#define RHS_STEP_X ((N0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (N0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+    const bool cond_y = y == 0;
+    const bool cond_x = ((x + 1) * N0 >= N);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x * N0 >= N) || (y * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
+
+    // Compute RHS reshaped matrix address
+    uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);   //uint zin0=0,zin1=0,zin2=0,... zin7=0;
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,zero2=0,... zero7=0;
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+
+    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply lhs_stride_z by DEPTH_GEMM3D
+    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0)    c0=0,c1=0,c2=0,... c(N0-1)=0;
+
+    int i = 0;
+    for(; i <= (K - K0); i += K0)
+    {
+        // Supported cases (M0, K0):
+        // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+        // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+        // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+        // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+        // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+        // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+        // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+        // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+        // Load values from LHS matrix
+        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
+
+        VEC_DATA_TYPE(DATA_TYPE, N0)
+        b0;
+
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(0, a, b0, c);
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(1, a, b0, c);
+#if K0 > 2
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(2, a, b0, c);
+#endif // K0 > 2
+#if K0 > 3
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(3, a, b0, c);
+#endif // K0 > 3
+#if K0 > 4
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(4, a, b0, c);
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(5, a, b0, c);
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(6, a, b0, c);
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(7, a, b0, c);
+#endif // K0 > 4
+#if K0 > 8
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(8, a, b0, c);
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(9, a, b0, c);
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(A, a, b0, c);
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(B, a, b0, c);
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(C, a, b0, c);
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(D, a, b0, c);
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(E, a, b0, c);
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(F, a, b0, c);
+#endif // K0 > 8
+
+        lhs_offset += K0 * sizeof(DATA_TYPE);
+        rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);
+    }
+
+    // Left-over accumulations
+    for(; i < K; ++i)
+    {
+        // Load values from LHS matrix
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
+#if M0 > 1
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
+#endif // M0 > 1
+#if M0 > 2
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
+#endif // M0 > 2
+#if M0 > 3
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
+#endif // M0 > 3
+#if M0 > 4
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
+#endif // M0 > 4
+#if M0 > 5
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
+#endif // M0 > 5
+#if M0 > 6
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
+#endif // M0 > 6
+#if M0 > 7
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
+#endif // M0 > 7
+
+        VEC_DATA_TYPE(DATA_TYPE, N0)
+        b0;
+
+        b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
+        VFMA_M0xN0(0, a, b0, c);
+
+        lhs_offset += sizeof(DATA_TYPE);
+        rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
+
+    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias
+    ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store output block
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+#undef RHS_STEP_LOOP
+}
+#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_NT)
+
+#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ *  The LHS matrix is NOT reshaped
+ *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
+ *
+ * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.
+ * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
+ *       Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
+ *       could be different from the value returned by get_image_height(rhs_img).
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 4, 8, 16
+ *  - K0 = 4, 8, 16
+ *  - H0 >= 1
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data type: F32
+ * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
+ * @param[in]  rhs_img                            The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in]  M                                  Number of rows in LHS matrix not reshaped.
+ * @param[in]  N                                  Number of columns in RHS matrix not reshaped.
+ * @param[in]  K                                  Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ */
+__kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
+                                                   __read_only image2d_t rhs_img,
+#if defined(BETA)
+                                                   IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+                                                   IMAGE_DECLARATION(dst),
+                                                   uint lhs_stride_z,
+                                                   uint rhs_stride_z,
+#if defined(BETA)
+                                                   uint bias_stride_z,
+#endif //defined(BETA)
+                                                   uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                                   ,
+                                                   uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                   ,
+                                                   uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                   ,
+                                                   const int M,
+                                                   const int N,
+                                                   const int K)
+{
+    // Pixel unit
+#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
+
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (PIXEL_UNIT)
+#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
+#define RHS_STEP_LOOP 1
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (PIXEL_UNIT)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+    const bool cond_y = y == 0;
+    const bool cond_x = ((x + 1) * N0 >= N);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x * N0 >= N) || (y * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    const uint z_rhs = (z % MATRIX_B_DEPTH);
+#else  // defined(MATRIX_B_DEPTH)
+    const uint z_rhs = z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    // Compute RHS matrix coordinates
+    uint       x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
+    const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+
+    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply lhs_stride_z by DEPTH_GEMM3D
+    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);
+
+    int i = 0;
+    for(; i <= (K - K0); i += K0)
+    {
+        // Load values from LHS matrix
+        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
+
+        VEC_DATA_TYPE(DATA_TYPE, N0)
+        b0;
+
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(0, a, b0, c);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(1, a, b0, c);
+#if K0 > 2
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(2, a, b0, c);
+#endif // K0 > 2
+#if K0 > 3
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(3, a, b0, c);
+#endif // K0 > 3
+#if K0 > 4
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(4, a, b0, c);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(5, a, b0, c);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(6, a, b0, c);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(7, a, b0, c);
+#endif // K0 > 4
+#if K0 > 8
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(8, a, b0, c);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(9, a, b0, c);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(A, a, b0, c);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(B, a, b0, c);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(C, a, b0, c);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(D, a, b0, c);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(E, a, b0, c);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
+        VFMA_M0xN0(F, a, b0, c);
+#endif // K0 > 8
+
+        lhs_offset += K0 * sizeof(DATA_TYPE);
+        x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP;
+    }
+
+    // Left-over accumulations
+    for(; i < K; ++i)
+    {
+        // Load values from LHS matrix
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
+#if M0 > 1
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
+#endif // M0 > 1
+#if M0 > 2
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
+#endif // M0 > 2
+#if M0 > 3
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
+#endif // M0 > 3
+#if M0 > 4
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
+#endif // M0 > 4
+#if M0 > 5
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
+#endif // M0 > 5
+#if M0 > 6
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
+#endif // M0 > 6
+#if M0 > 7
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
+#endif // M0 > 7
+
+        VEC_DATA_TYPE(DATA_TYPE, N0)
+        b0;
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
+
+        VFMA_M0xN0(0, a, b0, c);
+
+        lhs_offset += sizeof(DATA_TYPE);
+        x_rhs += RHS_STEP_X;
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
+
+    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias
+    ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store output block
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+#undef RHS_STEP_LOOP
+}
+#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE)
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR)
+
+#if defined(MIXED_PRECISION)
+#if K0 == 2
+#define ARM_DOT_K0(a, b, c) \
+    ({                      \
+        c += a.s0 * b.s0;   \
+        c += a.s1 * b.s1;   \
+    })
+#elif K0 == 3 // K0 == 3
+#define ARM_DOT_K0(a, b, c) \
+    ({                      \
+        c += a.s0 * b.s0;   \
+        c += a.s1 * b.s1;   \
+        c += a.s2 * b.s2;   \
+    })
+#elif K0 == 4 // K0 == 4
+#define ARM_DOT_K0(a, b, c) \
+    ({                      \
+        c += a.s0 * b.s0;   \
+        c += a.s1 * b.s1;   \
+        c += a.s2 * b.s2;   \
+        c += a.s3 * b.s3;   \
+    })
+#elif K0 == 8 // K0 == 8
+#define ARM_DOT_K0(a, b, c) \
+    ({                      \
+        c += a.s0 * b.s0;   \
+        c += a.s1 * b.s1;   \
+        c += a.s2 * b.s2;   \
+        c += a.s3 * b.s3;   \
+        c += a.s4 * b.s4;   \
+        c += a.s5 * b.s5;   \
+        c += a.s6 * b.s6;   \
+        c += a.s7 * b.s7;   \
+    })
+#elif K0 == 16 // K0 == 16
+#define ARM_DOT_K0(a, b, c) \
+    ({                      \
+        c += a.s0 * b.s0;   \
+        c += a.s1 * b.s1;   \
+        c += a.s2 * b.s2;   \
+        c += a.s3 * b.s3;   \
+        c += a.s4 * b.s4;   \
+        c += a.s5 * b.s5;   \
+        c += a.s6 * b.s6;   \
+        c += a.s7 * b.s7;   \
+        c += a.s8 * b.s8;   \
+        c += a.s9 * b.s9;   \
+        c += a.sA * b.sA;   \
+        c += a.sB * b.sB;   \
+        c += a.sC * b.sC;   \
+        c += a.sD * b.sD;   \
+        c += a.sE * b.sE;   \
+        c += a.sF * b.sF;   \
+    })
+#else // K0 not supported
+#error "K0 value not supported"
+#endif // K0 conditions
+#else  // defined(MIXED_PRECISION)
+#if K0 == 2
+#define ARM_DOT_K0(a, b, c)     \
+    ({                          \
+        c = fma(a.s0, b.s0, c); \
+        c = fma(a.s1, b.s1, c); \
+    })
+#elif K0 == 3 // K0 == 3
+#define ARM_DOT_K0(a, b, c)     \
+    ({                          \
+        c = fma(a.s0, b.s0, c); \
+        c = fma(a.s1, b.s1, c); \
+        c = fma(a.s2, b.s2, c); \
+    })
+#elif K0 == 4 // K0 == 4
+#define ARM_DOT_K0(a, b, c)     \
+    ({                          \
+        c = fma(a.s0, b.s0, c); \
+        c = fma(a.s1, b.s1, c); \
+        c = fma(a.s2, b.s2, c); \
+        c = fma(a.s3, b.s3, c); \
+    })
+#elif K0 == 8 // K0 == 8
+#define ARM_DOT_K0(a, b, c)     \
+    ({                          \
+        c = fma(a.s0, b.s0, c); \
+        c = fma(a.s1, b.s1, c); \
+        c = fma(a.s2, b.s2, c); \
+        c = fma(a.s3, b.s3, c); \
+        c = fma(a.s4, b.s4, c); \
+        c = fma(a.s5, b.s5, c); \
+        c = fma(a.s6, b.s6, c); \
+        c = fma(a.s7, b.s7, c); \
+    })
+#elif K0 == 16 // K0 == 16
+#define ARM_DOT_K0(a, b, c)     \
+    ({                          \
+        c = fma(a.s0, b.s0, c); \
+        c = fma(a.s1, b.s1, c); \
+        c = fma(a.s2, b.s2, c); \
+        c = fma(a.s3, b.s3, c); \
+        c = fma(a.s4, b.s4, c); \
+        c = fma(a.s5, b.s5, c); \
+        c = fma(a.s6, b.s6, c); \
+        c = fma(a.s7, b.s7, c); \
+        c = fma(a.s8, b.s8, c); \
+        c = fma(a.s9, b.s9, c); \
+        c = fma(a.sA, b.sA, c); \
+        c = fma(a.sB, b.sB, c); \
+        c = fma(a.sC, b.sC, c); \
+        c = fma(a.sD, b.sD, c); \
+        c = fma(a.sE, b.sE, c); \
+        c = fma(a.sF, b.sF, c); \
+    })
+#else // K0 not supported
+#error "K0 value not supported"
+#endif // K0 conditions
+#endif // defined(MIXED_PRECISION)
+
+#if defined(ARM_DOT_K0XN0)
+#undef ARM_DOT_K0XN0
+#endif // defined(ARM_DOT_K0XN0)
+
+#if N0 == 2
+#define ARM_DOT_K0XN0(a, b, c)           \
+    ({                                   \
+        ARM_DOT_K0((a), (b##0), (c.s0)); \
+        ARM_DOT_K0((a), (b##1), (c.s1)); \
+    })
+#elif N0 == 3 // N0 == 3
+#define ARM_DOT_K0XN0(a, b, c)           \
+    ({                                   \
+        ARM_DOT_K0((a), (b##0), (c.s0)); \
+        ARM_DOT_K0((a), (b##1), (c.s1)); \
+        ARM_DOT_K0((a), (b##2), (c.s2)); \
+    })
+#elif N0 == 4 // N0 == 4
+#define ARM_DOT_K0XN0(a, b, c)           \
+    ({                                   \
+        ARM_DOT_K0((a), (b##0), (c.s0)); \
+        ARM_DOT_K0((a), (b##1), (c.s1)); \
+        ARM_DOT_K0((a), (b##2), (c.s2)); \
+        ARM_DOT_K0((a), (b##3), (c.s3)); \
+    })
+#elif N0 == 8 // N0 == 8
+#define ARM_DOT_K0XN0(a, b, c)           \
+    ({                                   \
+        ARM_DOT_K0((a), (b##0), (c.s0)); \
+        ARM_DOT_K0((a), (b##1), (c.s1)); \
+        ARM_DOT_K0((a), (b##2), (c.s2)); \
+        ARM_DOT_K0((a), (b##3), (c.s3)); \
+        ARM_DOT_K0((a), (b##4), (c.s4)); \
+        ARM_DOT_K0((a), (b##5), (c.s5)); \
+        ARM_DOT_K0((a), (b##6), (c.s6)); \
+        ARM_DOT_K0((a), (b##7), (c.s7)); \
+    })
+#elif N0 == 16 // N0 == 16
+#define ARM_DOT_K0XN0(a, b, c)           \
+    ({                                   \
+        ARM_DOT_K0((a), (b##0), (c.s0)); \
+        ARM_DOT_K0((a), (b##1), (c.s1)); \
+        ARM_DOT_K0((a), (b##2), (c.s2)); \
+        ARM_DOT_K0((a), (b##3), (c.s3)); \
+        ARM_DOT_K0((a), (b##4), (c.s4)); \
+        ARM_DOT_K0((a), (b##5), (c.s5)); \
+        ARM_DOT_K0((a), (b##6), (c.s6)); \
+        ARM_DOT_K0((a), (b##7), (c.s7)); \
+        ARM_DOT_K0((a), (b##8), (c.s8)); \
+        ARM_DOT_K0((a), (b##9), (c.s9)); \
+        ARM_DOT_K0((a), (b##A), (c.sA)); \
+        ARM_DOT_K0((a), (b##B), (c.sB)); \
+        ARM_DOT_K0((a), (b##C), (c.sC)); \
+        ARM_DOT_K0((a), (b##D), (c.sD)); \
+        ARM_DOT_K0((a), (b##E), (c.sE)); \
+        ARM_DOT_K0((a), (b##F), (c.sF)); \
+    })
+#else // N0 not supported
+#error "N0 value not supported"
+#endif // N0 conditions
+
+#if defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
+ *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
+ * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
+ *  - V0 >= 1
+ *  - H0 >= 1
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in]  lhs_ptr                            Pointer to the LHS reshaped matrix. Supported data type: F16/F32
+ * @param[in]  lhs_stride_x                       Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                       Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS reshaped matrix
+ * @param[in]  rhs_ptr                            Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_x                       Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  rhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rhs_stride_y                       Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  rhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the RHS reshaped matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  lhs_stride_z                       Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in]  M                                  Number of rows in LHS matrix not reshaped.
+ * @param[in]  N                                  Number of columns in RHS matrix not reshaped.
+ * @param[in]  K                                  Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ */
+__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
+                                            IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+                                            IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+                                            IMAGE_DECLARATION(dst),
+                                            uint lhs_stride_z,
+                                            uint rhs_stride_z,
+#if defined(BETA)
+                                            uint bias_stride_z,
+#endif //defined(BETA)
+                                            uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                            ,
+                                            uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                            ,
+                                            const int M,
+                                            const int N,
+                                            const int K)
+{
+    // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (K0)
+#define LHS_STEP_X ((K0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (K0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(INTERLEAVE)
+
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
+                               (get_global_id(2) * lhs_stride_z);
+
+    // Compute RHS matrix address
+    __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    rhs_addr += get_global_id(2) * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+    for(int i = 0; i < K; i += K0)
+    {
+        // Supported cases (M0, K0):
+        // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+        // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+        // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+        // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+        // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+        // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+        // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+        // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+        // Load values from LHS matrix
+        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
+
+        // Load values from RHS matrix
+        LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);
+
+        // Accumulate
+        ARM_DOT_K0XN0(a0, b, c0);
+#if M0 > 1
+        ARM_DOT_K0XN0(a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+        ARM_DOT_K0XN0(a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+        ARM_DOT_K0XN0(a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+        ARM_DOT_K0XN0(a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+        ARM_DOT_K0XN0(a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+        ARM_DOT_K0XN0(a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+        ARM_DOT_K0XN0(a7, b, c7);
+#endif // M0 > 7
+
+        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
+        rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += get_global_id(2) * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias[broadcasted]
+#if defined(MIXED_PRECISION)
+    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
+#else  // defined(MIXED_PRECISION)
+    ADD_BLOCK_BROADCAST(M0, c, bias0);
+#endif // defined(MIXED_PRECISION)
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
+                                    2) * bias_stride_z;
+
+    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias
+#if defined(MIXED_PRECISION)
+    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+    ADD_BLOCK(M0, c, bias_hp);
+#else  // defined(MIXED_PRECISION)
+    ADD_BLOCK(M0, c, bias);
+#endif // defined(MIXED_PRECISION)
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+#if defined(MIXED_PRECISION)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
+#else  // defined(MIXED_PRECISION)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
+#endif // defined(MIXED_PRECISION)
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store output block
+#if defined(MIXED_PRECISION)
+    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+#else  // defined(MIXED_PRECISION)
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+#endif // defined(MIXED_PRECISION)
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+#undef LHS_STEP_LOOP
+#undef RHS_STEP_LOOP
+}
+#endif // defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T)
+
+#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_TEXTURE)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
+ *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
+ *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
+ *
+ * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
+ * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
+ * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
+ *       Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
+ *       could be different from the value returned by get_image_height(rhs_img).
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 4, 8, 16
+ *  - K0 = 4, 8, 16
+ *  - V0 >= 1
+ *  - H0 >= 1
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in]  lhs_ptr                            Pointer to the LHS reshaped matrix. Supported data type: F32
+ * @param[in]  lhs_stride_x                       Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                       Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS reshaped matrix
+ * @param[in]  rhs_img                            The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  lhs_stride_z                       Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in]  M                                  Number of rows in LHS matrix not reshaped.
+ * @param[in]  N                                  Number of columns in RHS matrix not reshaped.
+ * @param[in]  K                                  Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ */
+__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
+                                                    __read_only image2d_t rhs_img,
+#if defined(BETA)
+                                                    IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+                                                    IMAGE_DECLARATION(dst),
+                                                    uint lhs_stride_z,
+                                                    uint rhs_stride_z,
+#if defined(BETA)
+                                                    uint bias_stride_z,
+#endif //defined(BETA)
+                                                    uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                    ,
+                                                    uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                    ,
+                                                    const int M,
+                                                    const int N,
+                                                    const int K)
+{
+    // Pixel unit
+#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)
+
+    // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (K0)
+#define LHS_STEP_X ((K0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (K0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(INTERLEAVE)
+
+    // Block size
+#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (PIXEL_UNIT)
+#define RHS_STEP_X (PIXEL_UNIT * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X PIXEL_UNIT
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
+                               (get_global_id(2) * lhs_stride_z);
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
+#else  // defined(MATRIX_B_DEPTH)
+    const uint z_rhs = get_global_id(2);
+#endif // defined(MATRIX_B_DEPTH)
+
+    // Compute RHS matrix coordinates
+    uint       x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
+    const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+    for(int i = 0; i < K; i += K0)
+    {
+        // Load values from LHS matrix
+        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
+
+        // Load values from RHS matrix stored in a cl_image
+        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
+        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
+
+        // Accumulate
+        ARM_DOT_K0XN0(a0, b, c0);
+#if M0 > 1
+        ARM_DOT_K0XN0(a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+        ARM_DOT_K0XN0(a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+        ARM_DOT_K0XN0(a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+        ARM_DOT_K0XN0(a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+        ARM_DOT_K0XN0(a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+        ARM_DOT_K0XN0(a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+        ARM_DOT_K0XN0(a7, b, c7);
+#endif // M0 > 7
+
+        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
+
+        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += get_global_id(2) * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias[broadcasted]
+#if defined(MIXED_PRECISION)
+    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
+#else  // defined(MIXED_PRECISION)
+    ADD_BLOCK_BROADCAST(M0, c, bias0);
+#endif // defined(MIXED_PRECISION)
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
+                                    2) * bias_stride_z;
+
+    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias
+#if defined(MIXED_PRECISION)
+    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+    ADD_BLOCK(M0, c, bias_hp);
+#else  // defined(MIXED_PRECISION)
+    ADD_BLOCK(M0, c, bias);
+#endif // defined(MIXED_PRECISION)
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+#if defined(MIXED_PRECISION)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
+#else  // defined(MIXED_PRECISION)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
+#endif // defined(MIXED_PRECISION)
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store output block
+#if defined(MIXED_PRECISION)
+    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+#else  // defined(MIXED_PRECISION)
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+#endif // defined(MIXED_PRECISION)
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+#undef PIXEL_UNIT
+#undef LHS_STEP_LOOP
+#undef RHS_STEP_LOOP
+}
+#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_TEXTURE)
+
+#if defined(LHS_TRANSPOSE)
+
+#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE)
+
+#if defined(MIXED_PRECISION)
+
+#if(GPU_ARCH == GPU_ARCH_MIDGARD)
+#define ARM_VFMA(N0, a, b, c) c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)));
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+#define ARM_VFMA(N0, a, b, c) c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c));
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+#else // defined(MIXED_PRECISION
+
+#if(GPU_ARCH == GPU_ARCH_MIDGARD)
+#define ARM_VFMA(N0, a, b, c) c += (a) * (b);
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c));
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+#endif // defined(MIXED_PRECISION)
+
+#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C)         \
+    ({                                                 \
+        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); \
+    })
+#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C)            \
+    ({                                                    \
+        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
+        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
+    })
+#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C)            \
+    ({                                                    \
+        ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C);           \
+        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \
+    })
+#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C)            \
+    ({                                                    \
+        ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C);           \
+        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \
+    })
+#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C)            \
+    ({                                                    \
+        ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C);           \
+        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \
+        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \
+        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \
+        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \
+    })
+
+// Factory macro for the column-vector (transposed) by row-vector (not transposed) multiplication. K0 = 1
+// a is the column-vector (transposed)
+// b is the row-vector (not transposed)
+// C is the output matrix
+// Lower case is a vector (a, b)
+// Upper case is a matrix (C)
+#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C)
+
+#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C)             \
+    ({                                                         \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \
+    })
+#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C)             \
+    ({                                                         \
+        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C);            \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \
+    })
+#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C)             \
+    ({                                                         \
+        ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C);            \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \
+    })
+#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C)             \
+    ({                                                         \
+        ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C);            \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \
+    })
+#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C)             \
+    ({                                                         \
+        ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C);            \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \
+        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \
+    })
+#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C)           \
+    ({                                                        \
+        ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C);           \
+        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \
+        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \
+        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \
+        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##B), (B##B), C); \
+        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \
+        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \
+        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \
+        ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \
+    })
+
+// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication.
+// The dimensions for this matrix multiplications are defined through M0, N0 and K0
+// The dimensions supported are:
+// M0: 1, 2, 3, 4, 8
+// N0: 1, 2, 3, 4, 8, 16
+// K0: 1, 2, 3, 4, 8, 16
+// This macro calls the vector-by-matrix macro K0 times
+// A, B and C are matrices
+#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \
+    CONCAT(ARM_MM_T_NT_M0xN0x, K0)             \
+    (M0, N0, TYPE, A, B, C)
+
+#if defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
+ *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
+ *
+ * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 2, 3, 4, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
+ *  - V0 >= 1
+ *  - H0 >= 1
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in]  lhs_ptr                            Pointer to the LHS reshaped matrix. Supported data type: F16/F32
+ * @param[in]  lhs_stride_x                       Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                       Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS reshaped matrix
+ * @param[in]  rhs_ptr                            Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_x                       Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  rhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rhs_stride_y                       Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  rhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the RHS reshaped matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  lhs_stride_z                       Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in]  M                                  Number of rows in LHS matrix not reshaped.
+ * @param[in]  N                                  Number of columns in RHS matrix not reshaped.
+ * @param[in]  K                                  Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ */
+__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
+                                            IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+                                            IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+                                            IMAGE_DECLARATION(dst),
+                                            uint lhs_stride_z,
+                                            uint rhs_stride_z,
+#if defined(BETA)
+                                            uint bias_stride_z,
+#endif //defined(BETA)
+                                            uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                            ,
+                                            uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                            ,
+                                            const int M,
+                                            const int N,
+                                            const int K)
+{
+    // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (M0)
+#define LHS_STEP_X ((M0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (M0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(INTERLEAVE)
+
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (N0)
+#define RHS_STEP_X ((N0) * (H0))
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (N0)
+#endif // defined(RHS_INTERLEAVE)
+
+    const uint x = get_global_id(0);
+    const uint y = get_global_id(1);
+    const uint z = get_global_id(2);
+
+    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x * N0 >= N) || (y * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
+
+    // Compute RHS matrix address
+    __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    rhs_addr += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
+
+    __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
+    __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);
+
+    for(int i = 0; i < K; i += K0)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, M0)
+        a0;
+        VEC_DATA_TYPE(DATA_TYPE, N0)
+        b0;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+#if K0 > 1
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+#endif // K0 > 1
+
+#if K0 > 2
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+#endif // K0 > 2
+
+#if K0 > 3
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+#endif // K0 > 3
+
+#if K0 > 4
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+#endif // K0 > 4
+
+#if K0 > 8
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = VLOAD(N0)(0, rhs);
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+        rhs += RHS_STEP_X;
+#endif // K0 > 8
+
+#ifndef LHS_INTERLEAVE
+        lhs += (M0 * K0 * (V0 - 1));
+#endif // LHS_INTERLEAVE
+
+#ifndef RHS_INTERLEAVE
+        rhs += (N0 * K0 * (H0 - 1));
+#endif // RHS_INTERLEAVE
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
+
+    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias[broadcasted]
+#if defined(MIXED_PRECISION)
+    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
+#else  // defined(MIXED_PRECISION)
+    ADD_BLOCK_BROADCAST(M0, c, bias0);
+#endif // defined(MIXED_PRECISION)
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(
+                                    2) * bias_stride_z;
+
+    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+#if defined(MIXED_PRECISION)
+    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+    ADD_BLOCK(M0, c, bias_hp);
+#else  // defined(MIXED_PRECISION)
+    ADD_BLOCK(M0, c, bias);
+#endif // defined(MIXED_PRECISION)
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+#if defined(MIXED_PRECISION)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
+#else  // defined(MIXED_PRECISION)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
+#endif // defined(MIXED_PRECISION)
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store output block
+#if defined(MIXED_PRECISION)
+    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+#else  // defined(MIXED_PRECISION)
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+#endif // defined(MIXED_PRECISION)
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT)
+
+#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_TEXTURE)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
+ *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
+ *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
+ *
+ * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
+ * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M, N and K must be passed at runtime.
+ * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
+ *       Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
+ *       could be different from the value returned by get_image_height(rhs_img).
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 2, 3, 4, 8
+ *  - N0 = 4, 8, 16
+ *  - K0 = 4, 8, 16
+ *  - V0 >= 1
+ *  - H0 >= 1
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in]  lhs_ptr                            Pointer to the LHS reshaped matrix. Supported data type: F32
+ * @param[in]  lhs_stride_x                       Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                       Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS reshaped matrix
+ * @param[in]  rhs_img                            The RHS reshaped matrix as cl_image 2d. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  lhs_stride_z                       Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in]  M                                  Number of rows in LHS matrix not reshaped.
+ * @param[in]  N                                  Number of columns in RHS matrix not reshaped.
+ * @param[in]  K                                  Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ */
+__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),
+                                                    __read_only image2d_t rhs_img,
+#if defined(BETA)
+                                                    IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+                                                    IMAGE_DECLARATION(dst),
+                                                    uint lhs_stride_z,
+                                                    uint rhs_stride_z,
+#if defined(BETA)
+                                                    uint bias_stride_z,
+#endif //defined(BETA)
+                                                    uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                    ,
+                                                    uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                    ,
+                                                    const int M,
+                                                    const int N,
+                                                    const int K)
+{
+    // Pixel unit
+#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)
+
+    // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (M0)
+#define LHS_STEP_X ((M0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (M0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(INTERLEAVE)
+
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (PIXEL_UNIT)
+#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (PIXEL_UNIT)
+#endif // defined(RHS_INTERLEAVE)
+
+    const uint x = get_global_id(0);
+    const uint y = get_global_id(1);
+    const uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x * N0 >= N) || (y * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    const uint z_rhs = (z % MATRIX_B_DEPTH);
+#else  // defined(MATRIX_B_DEPTH)
+    const uint z_rhs = z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    // Compute RHS matrix coordinates
+    uint       x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
+    const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
+
+    __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
+
+    for(int i = 0; i < K; i += K0)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, M0)
+        a0;
+        VEC_DATA_TYPE(DATA_TYPE, N0)
+        b0;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+#if K0 > 1
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+#endif // K0 > 1
+
+#if K0 > 2
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+#endif // K0 > 2
+
+#if K0 > 3
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+#endif // K0 > 3
+
+#if K0 > 4
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+#endif // K0 > 4
+
+#if K0 > 8
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+
+        a0 = VLOAD(M0)(0, lhs);
+        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
+
+        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+        lhs += LHS_STEP_X;
+#endif // K0 > 8
+
+#ifndef LHS_INTERLEAVE
+        lhs += (M0 * K0 * (V0 - 1));
+#endif // LHS_INTERLEAVE
+
+        x_rhs += K0 * RHS_STEP_X;
+#ifndef RHS_INTERLEAVE
+        x_rhs += (PIXEL_UNIT * K0 * (H0 - 1));
+#endif // RHS_INTERLEAVE
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
+
+    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias[broadcasted]
+#if defined(MIXED_PRECISION)
+    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
+#else  // defined(MIXED_PRECISION)
+    ADD_BLOCK_BROADCAST(M0, c, bias0);
+#endif // defined(MIXED_PRECISION)
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;
+
+    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+#if defined(MIXED_PRECISION)
+    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+    ADD_BLOCK(M0, c, bias_hp);
+#else  // defined(MIXED_PRECISION)
+    ADD_BLOCK(M0, c, bias);
+#endif // defined(MIXED_PRECISION)
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+#if defined(MIXED_PRECISION)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
+#else  // defined(MIXED_PRECISION)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
+#endif // defined(MIXED_PRECISION)
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store output block
+#if defined(MIXED_PRECISION)
+    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+#else  // defined(MIXED_PRECISION)
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+#endif // defined(MIXED_PRECISION)
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+#undef PIXEL_UNIT
+#undef LHS_STEP_LOOP
+#undef RHS_STEP_LOOP
+}
+#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_TEXTURE)
+
+#endif // defined(LHS_TRANSPOSE)
+
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(DATA_TYPE)
+
+#define VFMA(a, b, c)     \
+    ({                    \
+        c = fma(a, b, c); \
+    })
+
+#if M0 == 1
+#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+    })
+#elif M0 == 2 // M0 == 2
+#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+    })
+#elif M0 == 3 // M0 == 3
+#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+    })
+#elif M0 == 4 // M0 == 4
+#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+    })
+#elif M0 == 5 // M0 == 5
+#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+    })
+#elif M0 == 6 // M0 == 6
+#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+    })
+#elif M0 == 7 // M0 == 7
+#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+    })
+#elif M0 == 8 // M0 == 8
+#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
+    ({                                                                \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
+    })
+#else // M0 not supported
+#error "M0 not supported"
+#endif // M0 not supported
+
+#if defined(GEMM_MM_NATIVE)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ *  The LHS matrix is NOT reshaped
+ *  The RHS matrix is NOT reshaped
+ *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g., -DK0=2)
+ * @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data type: F16/F32
+ * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
+ * @param[in]  rhs_ptr                            Pointer to the RHS matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_x                       Stride of the RHS matrix in X dimension (in bytes)
+ * @param[in]  rhs_step_x                         rhs_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rhs_stride_y                       Stride of the RHS matrix in Y dimension (in bytes)
+ * @param[in]  rhs_step_y                         rhs_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the RHS matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the RHS matrix in Z dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  M                                  Number of rows in LHS matrix not reshaped.
+ * @param[in]  N                                  Number of columns in RHS matrix not reshaped.
+ * @param[in]  K                                  Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),
+                             IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+                             IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+                             IMAGE_DECLARATION(dst),
+                             uint lhs_stride_z,
+                             uint rhs_stride_z,
+#if defined(BETA)
+                             uint bias_stride_z,
+#endif //defined(BETA)
+                             uint      dst_stride_z,
+                             const int M,
+                             const int N,
+                             const int K
+#if defined(REINTERPRET_INPUT_AS_3D)
+                             ,
+                             uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                             ,
+                             uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                            )
+{
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+    // RHS offset and step X
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x * N0 >= N) || (y * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
+
+    // Compute RHS matrix address
+    uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply lhs_stride_z by DEPTH_GEMM3D
+    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0)    c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+    int i = 0;
+#if K0 > 1
+    for(; i <= (K - K0); i += K0)
+    {
+        // Supported cases (M0, K0):
+        // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+        // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+        // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+        // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+        // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+        // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+        // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+        // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+        // Load values from LHS matrix
+        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+        // Load values from RHS matrix
+        LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);
+
+        RHS_VFMA_M0xN0(0, a, b0, c);
+        RHS_VFMA_M0xN0(1, a, b1, c);
+#if K0 > 2
+        RHS_VFMA_M0xN0(2, a, b2, c);
+#endif // K0 > 2
+#if K0 > 3
+        RHS_VFMA_M0xN0(3, a, b3, c);
+#endif // K0 > 3
+#if K0 > 4
+        RHS_VFMA_M0xN0(4, a, b4, c);
+        RHS_VFMA_M0xN0(5, a, b5, c);
+        RHS_VFMA_M0xN0(6, a, b6, c);
+        RHS_VFMA_M0xN0(7, a, b7, c);
+#endif // K0 > 4
+#if K0 > 8
+        RHS_VFMA_M0xN0(8, a, b8, c);
+        RHS_VFMA_M0xN0(9, a, b9, c);
+        RHS_VFMA_M0xN0(A, a, bA, c);
+        RHS_VFMA_M0xN0(B, a, bB, c);
+        RHS_VFMA_M0xN0(C, a, bC, c);
+        RHS_VFMA_M0xN0(D, a, bD, c);
+        RHS_VFMA_M0xN0(E, a, bE, c);
+        RHS_VFMA_M0xN0(F, a, bF, c);
+#endif // K0 > 8
+
+        lhs_offset += K0 * sizeof(DATA_TYPE);
+        rhs_offset += K0 * rhs_stride_y;
+    }
+#endif // K0 > 1
+    // Left-over accumulations
+    for(; i < K; ++i)
+    {
+        // Load values from LHS matrix
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));
+#if M0 > 1
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));
+#endif // M0 > 1
+#if M0 > 2
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));
+#endif // M0 > 2
+#if M0 > 3
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));
+#endif // M0 > 3
+#if M0 > 4
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));
+#endif // M0 > 4
+#if M0 > 5
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));
+#endif // M0 > 5
+#if M0 > 6
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));
+#endif // M0 > 6
+#if M0 > 7
+        VEC_DATA_TYPE(DATA_TYPE, 2)
+        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));
+#endif // M0 > 7
+
+        VEC_DATA_TYPE(DATA_TYPE, N0)
+        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));
+        RHS_VFMA_M0xN0(0, a, b, c);
+
+        lhs_offset += sizeof(DATA_TYPE);
+        rhs_offset += rhs_stride_y;
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
+
+    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BIAS
+
+    // c = c + bias
+    ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    const bool cond_y = y == 0;
+    const bool cond_x = ((x + 1) * N0 >= N);
+
+    // Store output block
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+}
+#endif // defined(GEMM_MM_NATIVE)
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(DATA_TYPE)
+
+#if defined(BETA)
+/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
+ *
+ * @note The beta's value need to be passed at compile time using -DBETA
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F32
+ * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src),
+                          TENSOR3D_DECLARATION(dst))
+{
+    // Compute source and destination addresses
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    // Load values from A x B
+    float4 alpha_ab = vload4(0, (__global float *)dst.ptr);
+
+    // Load values from Matrix C
+    float4 c = vload4(0, (__global float *)src.ptr);
+
+    // Computes alpha * axb + beta * c
+    float4 out = alpha_ab + (float4)BETA * c;
+
+    // Store final result in axb matrix
+    vstore4(out, 0, (__global float *)dst.ptr);
+}
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
+ *
+ * @note The beta's value need to be passed at compile time using -DBETA
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F16
+ * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),
+                          TENSOR3D_DECLARATION(dst))
+{
+    // Compute source and destination addresses
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    // Load values from A x B
+    half8 alpha_ab = vload8(0, (__global half *)dst.ptr);
+
+    // Load values from Matrix C
+    half8 c = vload8(0, (__global half *)src.ptr);
+
+    // Computes alpha * axb + beta * c
+    half8 out = alpha_ab + (half8)BETA * c;
+
+    // Store final result in axb matrix
+    vstore8(out, 0, (__global half *)dst.ptr);
+}
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+#endif // defined(BETA)
diff --git a/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl b/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl
new file mode 100644
index 0000000000..09b8956b68
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl
@@ -0,0 +1,556 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_MMUL)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices using the MMUL extension:
+ *
+ *  The LHS matrix is NOT reshaped
+ *  The RHS is reshaped with @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel and the block K0xN0 is NOT transposed
+ *
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of output columns processed by the the cooperative mmul extension must be passed at compile time using -DMMUL_N0 (e.g., -DMMUL_N0=2)
+ * @note The number of output rows processed by the the cooperative mmul extension must be passed at compile time using -DMMUL_M0 (e.g., -DMMUL_M0=2)
+ * @note The number of lhs columns (or rhs rows) processed by the the cooperative mmul extension must be passed at compile time using -DMMUL_K0 (e.g., -DMMUL_K0=2)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 > 0
+ *  - N0 = 1, 2, 3, 4, 8, 16
+ *  - K0 = 1
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ *
+ * @param[in]  lhs_ptr                           Pointer to the LHS tensor. Supported data types: F16/F32
+ * @param[in]  lhs_stride_y                      Stride of the LHS tensor in Y dimension (in bytes)
+ * @param[in]  lhs_stride_z                      Stride of the LHS tensor in Z dimension (in bytes)
+ * @param[in]  lhs_w                             The size of the width dimension of the LHS tensor
+ * @param[in]  lhs_h                             The size of the height dimension of the LHS tensor
+ * @param[in]  lhs_n                             The size of the depth dimension of the LHS tensor
+ * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the LHS tensor
+ * @param[in]  rhs_ptr                           Pointer to the RHS reshaped tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_y                      Stride of the RHS tensor in Y dimension (in bytes)
+ * @param[in]  rhs_stride_z                      Stride of the RHS tensor in Z dimension (in bytes)
+ * @param[in]  rhs_w                             The size of the width dimension of the RHS tensor
+ * @param[in]  rhs_h                             The size of the height dimension of the RHS tensor
+ * @param[in]  rhs_n                             The size of the depth dimension of the RHS tensor
+ * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the RHS tensor
+ * @param[in]  bia_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  bia_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in]  bia_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in]  bia_w                             (Optional) The size of the width dimension of the bias tensor
+ * @param[in]  bia_h                             (Optional) The size of the height dimension of the bias tensor
+ * @param[in]  bia_n                             (Optional) The size of the depth dimension of the bias tensor
+ * @param[in]  bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_w                             The size of the width dimension of the destination tensor
+ * @param[in]  dst_h                             The size of the height dimension of the destination tensor
+ * @param[in]  dst_n                             The size of the depth dimension of the destination tensor
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  M                                 Number of rows in LHS matrix not reshaped
+ * @param[in]  N                                 Number of columns in RHS matrix not reshaped
+ * @param[in]  K                                 Number of columns in LHS matrix and rows in RHS matrix not reshaped
+ */
+__kernel void gemm_mm_reshaped_only_rhs_nt_mmul(
+    TENSOR3D_T(lhs, BUFFER),
+    TENSOR3D_T(rhs, BUFFER),
+#if defined(BETA)
+    TENSOR3D_T(bia, BUFFER),
+#endif // defined(BETA)
+    TENSOR3D_T(dst, BUFFER),
+    const int M,
+    const int N,
+    const int K)
+{
+#define MMUL_BLOCK_SIZE (MMUL_N0 * MMUL_K0)
+
+    uint x0 = get_global_id(0); // (N / N0) * MMUL_K0
+    uint y0 = get_global_id(1); // (M / M0) / MMUL_M0
+    uint z  = get_global_id(2); // Batch
+
+    // Get block ID and thread ID within the block
+    uint block_id  = (x0 / MMUL_BLOCK_SIZE);
+    uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+
+    // Coordinate within a block
+    uint block_x = thread_id % MMUL_N0;
+    uint block_y = (thread_id / MMUL_M0);
+
+    // Starting destination coordinates
+    uint dst_x = min(block_x * N0 + block_id * MMUL_N0 * N0, (uint)(N - 1));
+    uint dst_y = min(block_y * M0 + y0 * M0 * MMUL_M0, (uint)(M - M0));
+
+    // Note: We need to clamp dst_x and dst_y because we always need to execute a complete MMUL block! Only after the matrix multiplication
+    // part can we exit the kernel if it is out-of-bound. Remember, we have a cooperative matrix multiplication. Therefore, we need a full block to get the correct results
+
+    // Starting LHS coordinates
+    uint lhs_x = block_x;
+    uint lhs_y = dst_y;
+
+    // Starting RHS coordinates
+    uint rhs_x = block_y * N0 * MMUL_N0 + block_x * N0;
+    uint rhs_y = block_id;
+
+    // Compute LHS/RHS/DST matrix address
+#ifdef REINTERPRET_INPUT_AS_3D
+    lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + (lhs_y + z * M) * lhs_stride_y;
+#else // REINTERPRET_INPUT_AS_3D
+    lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+#endif // REINTERPRET_INPUT_AS_3D
+
+#ifdef BATCHED_RHS
+    rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+#else // BATCHED_RHS
+    rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y;
+#endif // BATCHED_RHS
+
+#ifdef REINTERPRET_OUTPUT_AS_3D
+    dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + (dst_y + z * M) * dst_stride_y;
+#else // REINTERPRET_OUTPUT_AS_3D
+    dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+#endif // REINTERPRET_OUTPUT_AS_3D
+
+    // Note: If RHS derives from the weights of convolution 2d layer, RHS will always be 2D and rhs_stride_z will always be equal to 0 for
+    // not sliding the tensor
+
+    // Initialize the accumulators
+    // MMUL extension accumulate the result in F32 for both F32 and F16
+    TILE(float, M0, N0, c_f32);
+
+#if !defined(HALF_PRECISION)
+#define c c_f32
+#endif // !defined(HALF_PRECISION)
+
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c_f32[i].v = 0;
+    })
+
+    for(int k = 0; k <= K - MMUL_K0; k += MMUL_K0)
+    {
+        TILE(DATA_TYPE, M0, 1, a);
+        TILE(DATA_TYPE, 1, N0, b);
+
+        // Load tile from the lhs/rhs tensors
+        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, 0, b);
+
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            LOOP_UNROLLING(int, n0, 0, 1, N0,
+            {
+                c_f32[m0].s[n0] = arm_matrix_multiply(a[m0].s[0], b[0].s[n0], c_f32[m0].s[n0]);
+            })
+        })
+
+        lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+        rhs_offset_first_element_in_bytes += MMUL_K0 * MMUL_N0 * N0 * sizeof(DATA_TYPE);
+    }
+
+    if(block_x * N0 + block_id * MMUL_N0 * N0 >= N)
+    {
+        return;
+    }
+
+    if(block_y * M0 + y0 * M0 * MMUL_M0 >= M)
+    {
+        return;
+    }
+
+#if defined(HALF_PRECISION)
+    TILE(DATA_TYPE, M0, N0, c);
+
+    // Conversion required for the half precision
+    LOOP_UNROLLING(int, m0, 0, 1, M0,
+    {
+        LOOP_UNROLLING(int, n0, 0, 1, N0,
+        {
+            c[m0].s[n0] = c_f32[m0].s[n0];
+        })
+    })
+#endif // defined(HALF_PRECISION)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    T_SCALE_CONSTANT(DATA_TYPE, M0, N0, c, (DATA_TYPE)ALPHA, c);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    bia_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE);
+
+    TILE(DATA_TYPE, 1, N0, bias0);
+
+    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+    {
+        bias0[0].v = VLOAD(N0)(0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
+    }
+    else
+    {
+        VLOAD_PARTIAL(N0, N0_LEFTOVER)
+        (bias0[0].v, 0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
+    }
+
+#ifndef UNIT_BETA
+    T_SCALE_CONSTANT(DATA_TYPE, 1, N0, bias0, (DATA_TYPE)BETA, bias0);
+#endif // UNIT_BIAS
+
+    // c = c + bias[broadcasted]
+    T_ELTWISE_BROADCAST_X(V_ADD, DATA_TYPE, M0, N0, c, bias0, c);
+#else // defined(BROADCAST_BIAS)
+    TILE(DATA_TYPE, M0, N0, bias0);
+
+    bia_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * bia_stride_y + z * bia_stride_z;
+
+    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                bias0[m0].v = VLOAD(N0)(0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + m0 * bia_stride_y));
+            }
+        })
+    }
+    else
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VLOAD_PARTIAL(N0, N0_LEFTOVER)
+                (bias0[m0].v, 0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + m0 * bia_stride_y));
+            }
+        })
+    }
+
+#ifndef UNIT_BETA
+    T_SCALE_CONSTANT(DATA_TYPE, M0, N0, bias0, (DATA_TYPE)BETA, bias0);
+#endif // UNIT_BIAS
+
+    // c = c + bias
+    T_ADD(DATA_TYPE, M0, N0, c, bias0, c);
+    // c = c + bias
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+    T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);
+
+    // Store
+    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE(N0)
+                (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+    else
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE_PARTIAL(N0, N0_LEFTOVER)
+                (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_MMUL)
+
+#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_MMUL_TEXTURE)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices using the MMUL extension and the OpenCL image for RHS:
+ *
+ *  The LHS matrix is NOT reshaped
+ *  The RHS is reshaped with @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel and the block K0xN0 is NOT transposed
+ *
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of output columns processed by the the cooperative mmul extension must be passed at compile time using -DMMUL_N0 (e.g., -DMMUL_N0=2)
+ * @note The number of output rows processed by the the cooperative mmul extension must be passed at compile time using -DMMUL_M0 (e.g., -DMMUL_M0=2)
+ * @note The number of lhs columns (or rhs rows) processed by the the cooperative mmul extension must be passed at compile time using -DMMUL_K0 (e.g., -DMMUL_K0=2)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 > 0
+ *  - N0 = 1, 2, 3, 4, 8, 16
+ *  - K0 = 1
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ *
+ * @param[in]  lhs_ptr                           Pointer to the LHS tensor. Supported data types: F16/F32
+ * @param[in]  lhs_stride_y                      Stride of the LHS tensor in Y dimension (in bytes)
+ * @param[in]  lhs_stride_z                      Stride of the LHS tensor in Z dimension (in bytes)
+ * @param[in]  lhs_w                             The size of the width dimension of the LHS tensor
+ * @param[in]  lhs_h                             The size of the height dimension of the LHS tensor
+ * @param[in]  lhs_n                             The size of the depth dimension of the LHS tensor
+ * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the LHS tensor
+ * @param[in]  rhs_ptr                           Pointer to the RHS reshaped tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_y                      Stride of the RHS tensor in Y dimension (in bytes)
+ * @param[in]  rhs_stride_z                      Stride of the RHS tensor in Z dimension (in bytes)
+ * @param[in]  rhs_w                             The size of the width dimension of the RHS tensor
+ * @param[in]  rhs_h                             The size of the height dimension of the RHS tensor
+ * @param[in]  rhs_n                             The size of the depth dimension of the RHS tensor
+ * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the RHS tensor
+ * @param[in]  bia_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  bia_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in]  bia_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in]  bia_w                             (Optional) The size of the width dimension of the bias tensor
+ * @param[in]  bia_h                             (Optional) The size of the height dimension of the bias tensor
+ * @param[in]  bia_n                             (Optional) The size of the depth dimension of the bias tensor
+ * @param[in]  bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_w                             The size of the width dimension of the destination tensor
+ * @param[in]  dst_h                             The size of the height dimension of the destination tensor
+ * @param[in]  dst_n                             The size of the depth dimension of the destination tensor
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  M                                 Number of rows in LHS matrix not reshaped
+ * @param[in]  N                                 Number of columns in RHS matrix not reshaped
+ * @param[in]  K                                 Number of columns in LHS matrix and rows in RHS matrix not reshaped
+ */
+__kernel void gemm_mm_reshaped_only_rhs_nt_mmul_texture(
+    TENSOR3D_T(lhs, BUFFER),
+    TENSOR3D_T(rhs, IMAGE),
+#if defined(BETA)
+    TENSOR3D_T(bia, BUFFER),
+#endif // defined(BETA)
+    TENSOR3D_T(dst, BUFFER),
+    const int M,
+    const int N,
+    const int K)
+{
+#define MMUL_BLOCK_SIZE (MMUL_N0 * MMUL_K0)
+
+    uint x0 = get_global_id(0); // (N / N0) * MMUL_K0
+    uint y0 = get_global_id(1); // (M / M0) / MMUL_M0
+    uint z  = get_global_id(2); // Batch
+
+    // Get block ID and thread ID within the block
+    uint block_id  = (x0 / MMUL_BLOCK_SIZE);
+    uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+
+    // Coordinate within a block
+    uint block_x = thread_id % MMUL_N0;
+    uint block_y = (thread_id / MMUL_M0);
+
+    // Starting destination coordinates
+    uint dst_x = min(block_x * N0 + block_id * MMUL_N0 * N0, (uint)(N - 1));
+    uint dst_y = min(block_y * M0 + y0 * M0 * MMUL_M0, (uint)(M - M0));
+
+    // Note: We need to clamp dst_x and dst_y because we always need to execute a complete MMUL block! Only after the matrix multiplication
+    // part can we exit the kernel if it is out-of-bound. Remember, we have a cooperative matrix multiplication. Therefore, we need a full block to get the correct results
+
+    // Starting LHS coordinates
+    uint lhs_x = block_x;
+    uint lhs_y = dst_y;
+
+    // Starting RHS coordinates
+    uint rhs_x = block_y * N0 * MMUL_N0 + block_x * N0;
+
+#ifdef BATCHED_RHS
+    uint rhs_y = block_id + z * rhs_h;
+#else // BATCHED_RHS
+    uint rhs_y = block_id;
+#endif // BATCHED_RHS
+
+    // Compute LHS/RHS/DST matrix address
+#ifdef REINTERPRET_INPUT_AS_3D
+    lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + (lhs_y + z * M) * lhs_stride_y;
+#else // REINTERPRET_INPUT_AS_3D
+    lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+#endif // REINTERPRET_INPUT_AS_3D
+
+#ifdef REINTERPRET_OUTPUT_AS_3D
+    dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + (dst_y + z * M) * dst_stride_y;
+#else // REINTERPRET_OUTPUT_AS_3D
+    dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+#endif // REINTERPRET_OUTPUT_AS_3D
+
+    // Initialize the accumulators
+    // MMUL extension accumulate the result in F32 for both F32 and F16
+    TILE(float, M0, N0, c_f32);
+
+#if !defined(HALF_PRECISION)
+#define c c_f32
+#endif // !defined(HALF_PRECISION)
+
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c_f32[i].v = 0;
+    })
+
+    for(int k = 0; k <= K - MMUL_K0; k += MMUL_K0)
+    {
+        TILE(DATA_TYPE, M0, 1, a);
+        TILE(DATA_TYPE, 1, N0, b);
+
+        // Load tile from the lhs/rhs tensors
+        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD(DATA_TYPE, 1, N0, IMAGE, rhs, rhs_x, rhs_y, 1, rhs_stride_y, b);
+
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            LOOP_UNROLLING(int, n0, 0, 1, N0,
+            {
+                c_f32[m0].s[n0] = arm_matrix_multiply(a[m0].s[0], b[0].s[n0], c_f32[m0].s[n0]);
+            })
+        })
+
+        lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+        rhs_x += MMUL_K0 * MMUL_N0 * N0;
+    }
+
+    if(block_x * N0 + block_id * MMUL_N0 * N0 >= N)
+    {
+        return;
+    }
+
+    if(block_y * M0 + y0 * M0 * MMUL_M0 >= M)
+    {
+        return;
+    }
+
+#if defined(HALF_PRECISION)
+    TILE(DATA_TYPE, M0, N0, c);
+
+    // Conversion required for the half precision
+    LOOP_UNROLLING(int, m0, 0, 1, M0,
+    {
+        LOOP_UNROLLING(int, n0, 0, 1, N0,
+        {
+            c[m0].s[n0] = c_f32[m0].s[n0];
+        })
+    })
+#endif // defined(HALF_PRECISION)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    T_SCALE_CONSTANT(DATA_TYPE, M0, N0, c, (DATA_TYPE)ALPHA, c);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    bia_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE);
+
+    TILE(DATA_TYPE, 1, N0, bias0);
+
+    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+    {
+        bias0[0].v = VLOAD(N0)(0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
+    }
+    else
+    {
+        VLOAD_PARTIAL(N0, N0_LEFTOVER)
+        (bias0[0].v, 0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
+    }
+
+#ifndef UNIT_BETA
+    T_SCALE_CONSTANT(DATA_TYPE, 1, N0, bias0, (DATA_TYPE)BETA, bias0);
+#endif // UNIT_BIAS
+
+    // c = c + bias[broadcasted]
+    T_ELTWISE_BROADCAST_X(V_ADD, DATA_TYPE, M0, N0, c, bias0, c);
+#else // defined(BROADCAST_BIAS)
+    TILE(DATA_TYPE, M0, N0, bias0);
+
+    bia_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * bia_stride_y + z * bia_stride_z;
+
+    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                bias0[m0].v = VLOAD(N0)(0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + m0 * bia_stride_y));
+            }
+        })
+    }
+    else
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VLOAD_PARTIAL(N0, N0_LEFTOVER)
+                (bias0[m0].v, 0, (DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes + m0 * bia_stride_y));
+            }
+        })
+    }
+
+#ifndef UNIT_BETA
+    T_SCALE_CONSTANT(DATA_TYPE, M0, N0, bias0, (DATA_TYPE)BETA, bias0);
+#endif // UNIT_BIAS
+
+    // c = c + bias
+    T_ADD(DATA_TYPE, M0, N0, c, bias0, c);
+    // c = c + bias
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+    T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);
+
+    // Store
+    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE(N0)
+                (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+    else
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE_PARTIAL(N0, N0_LEFTOVER)
+                (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_MMUL_TEXTURE)
diff --git a/src/core/CL/cl_kernels/common/gemm_utils.cl b/src/core/CL/cl_kernels/common/gemm_utils.cl
new file mode 100644
index 0000000000..be57d94ce6
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/gemm_utils.cl
@@ -0,0 +1,458 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "gemm_helpers.h"
+#include "helpers.h"
+#include "repeat.h"
+#include "tile_helpers.h"
+
+#if defined(RESHAPE_LHS_NT)
+/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (not transposed) in
+ *  the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
+ * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
+ * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_M0 (e.g. -DPARTIAL_M0=1)
+ * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_K0 (e.g. -DPARTIAL_K0=1)
+ * @note Only the following values for M0, K0 and V0 are supported:
+ *                                      M0: 2,3,4,5,6,7,8
+ *                                      K0: 2,3,4,8,16
+ *                                      V0: greater than 0
+ * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
+ *
+ * @param[in] src_ptr                           Pointer to the source tensor. Supported data types: All
+ * @param[in] src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_w                             The size of the width dimension of the source tensor
+ * @param[in] src_h                             The size of the height dimension of the source tensor
+ * @param[in] src_n                             The size of the depth dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: All
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_w                             The size of the width dimension of the destination tensor
+ * @param[in] dst_h                             The size of the height dimension of the destination tensor
+ * @param[in] dst_n                             The size of the depth dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] M                                 The size of height dimension of the source tensor, affected by reinterpret_input_as_3d
+ * @param[in] V0                                The number of blocks to place on the same row. It must be greater than 0.
+ */
+__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_T(src, BUFFER),
+                                         TENSOR3D_T(dst, BUFFER),
+                                         const int M,
+                                         const int V0)
+{
+    // Block size
+#define BLOCK_SIZE ((M0) * (K0))
+
+    // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (K0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+    // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (K0) * (V0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (K0)
+#endif // defined(INTERLEAVE)
+
+    const int x = GET_SPATIAL_IDX(0, 1, 0); // K
+    const int y = GET_SPATIAL_IDX(1, 1, 0); // M
+    const int z = GET_SPATIAL_IDX(2, 1, 0); // Batch size
+
+    const int xi = x * K0;
+    const int yi = y * M0;
+
+    const int xo = x * BLOCK_SIZE * V0 + (y % V0) * OUTPUT_OFFSET_X;
+    const int yo = (y / V0);
+
+    // src_stride_z is expressed as M * src_stride_y, to handle case where reinterpret_input_as_3d=true
+    src_offset_first_element_in_bytes += yi * src_stride_y + z * M * src_stride_y;
+    dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
+
+    TILE(DATA_TYPE, M0, K0, in);
+
+    // Initialize the input tile to zero
+    LOOP_UNROLLING(int, _i, 0, 1, M0,
+    {
+        in[_i].v = 0;
+    });
+
+    bool x_cond = (xi + K0 >= src_w) && (PARTIAL_K0 != 0);
+    bool y_cond = (yi + M0 >= M) && (PARTIAL_M0 != 0);
+    // Load input tile
+    TILE(uint, M0, 1, in_indirect_y);
+    LOOP_UNROLLING(int, _i, 0, 1, M0,
+    {
+        in_indirect_y[_i].v = _i;
+
+    });
+#if PARTIAL_M0 != 0
+    if(y_cond)
+    {
+        T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, PARTIAL_M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
+    }
+    else
+#endif // PARTIAL_M0 != 0
+    {
+        T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
+    }
+
+    // Store output tile
+    TILE(uint, M0, 1, dst_indirect_y);
+    LOOP_UNROLLING(int, _i, 0, 1, M0,
+    {
+        dst_indirect_y[_i].v = _i;
+    });
+
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in, dst_indirect_y);
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+#endif // defined(RESHAPE_LHS_NT)
+
+#if defined(RESHAPE_LHS_T)
+/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (transposed) in
+ *  the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
+ * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
+ * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_M0 (e.g. -DPARTIAL_M0=1)
+ * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_K0 (e.g. -DPARTIAL_K0=1)
+ * @note Only the following values for M0, K0 and V0 are supported:
+ *                                      M0: 2,3,4,8,16
+ *                                      K0: 2,3,4,8,16
+ *                                      V0: greater than 0
+ * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
+ *
+ * @param[in] src_ptr                           Pointer to the source tensor. Supported data types: All
+ * @param[in] src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_w                             The size of the width dimension of the source tensor
+ * @param[in] src_h                             The size of the height dimension of the source tensor
+ * @param[in] src_n                             The size of the depth dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: All
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_w                             The size of the width dimension of the destination tensor
+ * @param[in] dst_h                             The size of the height dimension of the destination tensor
+ * @param[in] dst_n                             The size of the depth dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] M                                 The size of height dimension of the source tensor, affected by reinterpret_input_as_3d
+ * @param[in] V0                                The number of blocks to place on the same row. It must be greater than 0
+ */
+__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_T(src, BUFFER),
+                                        TENSOR3D_T(dst, BUFFER),
+                                        const int M,
+                                        const int V0)
+{
+    // Block size
+#define BLOCK_SIZE ((M0) * (K0))
+
+    // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (M0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+    // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (M0) * (V0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (M0)
+#endif // defined(INTERLEAVE)
+
+    const int x = GET_SPATIAL_IDX(0, 1, 0); // K
+    const int y = GET_SPATIAL_IDX(1, 1, 0); // M
+    const int z = GET_SPATIAL_IDX(2, 1, 0); // Batch size
+
+    const int xi = x * K0;
+    const int yi = y * M0;
+
+    const int xo = x * BLOCK_SIZE * V0 + ((y % V0) * OUTPUT_OFFSET_X);
+    const int yo = (y / V0);
+
+    // src_stride_z is expressed as M * src_stride_y, to handle case where reinterpret_input_as_3d=true
+    src_offset_first_element_in_bytes += yi * src_stride_y + z * M * src_stride_y;
+    dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
+
+    TILE(DATA_TYPE, M0, K0, in);
+    TILE(DATA_TYPE, K0, M0, in_tr);
+
+    // Initialize the tile to zero
+    LOOP_UNROLLING(int, _i, 0, 1, M0,
+    {
+        in[_i].v = 0;
+    });
+
+    // Load input tile
+    bool x_cond = (xi + K0 >= src_w) && (PARTIAL_K0 != 0);
+    bool y_cond = (yi + M0 >= M) && (PARTIAL_M0 != 0);
+
+    TILE(uint, M0, 1, in_indirect_y);
+    LOOP_UNROLLING(int, _i, 0, 1, M0,
+    {
+        in_indirect_y[_i].v = _i;
+
+    });
+#if PARTIAL_M0 != 0
+    if(y_cond)
+    {
+        T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, PARTIAL_M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
+    }
+    else
+#endif // PARTIAL_M0 != 0
+    {
+        T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, K0, PARTIAL_K0, BUFFER, src, xi, src_stride_y, x_cond, in, in_indirect_y);
+    }
+    // Transpose input tile
+    LOOP_UNROLLING(int, m0, 0, 1, M0,
+    {
+        LOOP_UNROLLING(int, k0, 0, 1, K0,
+        {
+            in_tr[k0].s[m0] = in[m0].s[k0];
+        })
+    });
+
+    TILE(uint, K0, 1, dst_indirect_y);
+    LOOP_UNROLLING(int, _i, 0, 1, K0,
+    {
+        dst_indirect_y[_i].v = _i;
+    });
+
+    // Store output tile
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, K0, M0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in_tr, dst_indirect_y);
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+#endif // defined(RESHAPE_LHS_T)
+
+#if defined(RESHAPE_RHS_NT)
+/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (not transposed) in
+ *  the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
+ * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
+ * @note Only the following values for K0, N0 and H0 are supported:
+ *                                      N0: 2,3,4,8,16
+ *                                      K0: 1,2,3,4,8,16
+ *                                      H0: greater than 0
+ *
+ * @param[in] src_ptr                           Pointer to the source tensor. Supported data types: All
+ * @param[in] src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_w                             The size of the width dimension of the source tensor
+ * @param[in] src_h                             The size of the height dimension of the source tensor
+ * @param[in] src_n                             The size of the depth dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: All
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_w                             The size of the width dimension of the destination tensor
+ * @param[in] dst_h                             The size of the height dimension of the destination tensor
+ * @param[in] dst_n                             The size of the depth dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] H0                                The number of blocks to place on the same row. It must be greater than 0
+ */
+__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_T(src, BUFFER),
+                                         TENSOR3D_T(dst, BUFFER),
+                                         const int H0)
+{
+    // Block size
+#define BLOCK_SIZE ((K0) * (N0))
+
+    // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (N0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+    // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (N0) * (H0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (N0)
+#endif // defined(INTERLEAVE)
+
+    const int x = GET_SPATIAL_IDX(0, 1, 0);
+    const int y = GET_SPATIAL_IDX(1, 1, 0);
+    const int z = GET_SPATIAL_IDX(2, 1, 0);
+
+    const int xi = x * N0;
+    const int yi = y * K0;
+
+    const int xo = y * BLOCK_SIZE * H0 + (x % H0) * OUTPUT_OFFSET_X;
+    const int yo = (x / H0);
+
+    src_offset_first_element_in_bytes += yi * src_stride_y + z * src_stride_z;
+    dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
+
+    TILE(DATA_TYPE, K0, N0, in);
+
+    // Initialize the tile to zero
+    for(int i = 0; i < K0; ++i)
+    {
+        in[i].v = 0;
+    }
+
+    // Load input tile
+    for(int i = 0; i < K0; ++i)
+    {
+        if(yi + i < src_h)
+        {
+            in[i].v = V_LOAD(DATA_TYPE, N0, BUFFER, src, xi, i, src_stride_y);
+        }
+    }
+
+    TILE(uint, K0, 1, dst_indirect_y);
+    for(int i = 0; i < K0; ++i)
+    {
+        dst_indirect_y[i].v = i;
+    }
+
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, K0, N0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in, dst_indirect_y);
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+#endif // defined(RESHAPE_RHS_NT)
+
+#if defined(RESHAPE_RHS_T)
+/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (transposed) in
+ *  the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
+ * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must passed at compile time.
+ * @note The option -DTRANSPOSE must passed at compile time.
+ * @note Only the following values for K0, N0 and H0 are supported:
+ *                                      N0: 2,3,4,8,16
+ *                                      K0: 2,3,4,8,16
+ *                                      H0: greater than 0
+ *
+ * @param[in] src_ptr                           Pointer to the source tensor. Supported data types: All
+ * @param[in] src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_w                             The size of the width dimension of the source tensor
+ * @param[in] src_h                             The size of the height dimension of the source tensor
+ * @param[in] src_n                             The size of the depth dimension of the source tensor
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: All
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_w                             The size of the width dimension of the destination tensor
+ * @param[in] dst_h                             The size of the height dimension of the destination tensor
+ * @param[in] dst_n                             The size of the depth dimension of the destination tensor
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] H0                                The number of blocks to place on the same row. It must be greater than 0.
+ */
+__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_T(src, BUFFER),
+                                        TENSOR3D_T(dst, BUFFER),
+                                        const int H0)
+{
+    // Block size
+#define BLOCK_SIZE ((K0) * (N0))
+
+    // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (K0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+    // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (K0) * (H0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (K0)
+#endif // defined(INTERLEAVE)
+
+    const int x = GET_SPATIAL_IDX(0, 1, 0);
+    const int y = GET_SPATIAL_IDX(1, 1, 0);
+    const int z = GET_SPATIAL_IDX(2, 1, 0);
+
+    const int xi = x * N0;
+    const int yi = y * K0;
+
+    const int xo = y * BLOCK_SIZE * H0 + (x % H0) * OUTPUT_OFFSET_X;
+    const int yo = (x / H0);
+
+    src_offset_first_element_in_bytes += yi * src_stride_y + z * src_stride_z;
+    dst_offset_first_element_in_bytes += yo * dst_stride_y + z * dst_stride_z;
+
+    TILE(DATA_TYPE, K0, N0, in);
+    TILE(DATA_TYPE, N0, K0, in_tr);
+
+    // Initialize the tile to zero
+    for(int i = 0; i < K0; ++i)
+    {
+        in[i].v = 0;
+    }
+
+    // Load input tile
+    for(int i = 0; i < K0; ++i)
+    {
+        if(yi + i < src_h)
+        {
+            in[i].v = V_LOAD(DATA_TYPE, N0, BUFFER, src, xi, i, src_stride_y);
+        }
+    }
+
+    // Transpose input tile
+    for(int k0 = 0; k0 < K0; ++k0)
+    {
+        for(int n0 = 0; n0 < N0; ++n0)
+        {
+            in_tr[n0].s[k0] = in[k0].s[n0];
+        }
+    }
+
+    TILE(uint, N0, 1, dst_indirect_y);
+    for(int i = 0; i < N0; ++i)
+    {
+        dst_indirect_y[i].v = i;
+    }
+
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, N0, K0, 0, BUFFER, dst, xo, (OUTPUT_STEP_X * sizeof(DATA_TYPE)), false, in_tr, dst_indirect_y);
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+
+#endif // defined(RESHAPE_RHS_T)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/gemmlowp.cl b/src/core/CL/cl_kernels/common/gemmlowp.cl
new file mode 100644
index 0000000000..62c4cd31f5
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/gemmlowp.cl
@@ -0,0 +1,2162 @@
+/*
+ * Copyright (c) 2017-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "gemm_helpers.h"
+#include "helpers_asymm.h"
+#include "repeat.h"
+#include "tile_helpers.h"
+
+#if defined(DATA_TYPE) && defined(ACC_DATA_TYPE)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), (val));
+#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#define ARM_DOT(x, y, val) val += arm_dot((x), (y));
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#define ARM_DOT1(a, b, c)                                                                                                                               \
+    ({                                                                                                                                                  \
+        ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 3))0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 3))0), c); \
+    })
+#define ARM_DOT2(a, b, c)                                                                                                                               \
+    ({                                                                                                                                                  \
+        ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 2))0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 2))0), c); \
+    })
+#define ARM_DOT3(a, b, c)                                                                                           \
+    ({                                                                                                              \
+        ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (DATA_TYPE)0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (DATA_TYPE)0), c); \
+    })
+#define ARM_DOT4(a, b, c) \
+    ({                    \
+        ARM_DOT(a, b, c); \
+    })
+#define ARM_DOT8(a, b, c)            \
+    ({                               \
+        ARM_DOT4((a.lo), (b.lo), c); \
+        ARM_DOT4((a.hi), (b.hi), c); \
+    })
+#define ARM_DOT16(a, b, c)           \
+    ({                               \
+        ARM_DOT8((a.lo), (b.lo), c); \
+        ARM_DOT8((a.hi), (b.hi), c); \
+    })
+
+#else // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+/** Specialized macros to perform the dot product instruction between two vectors of size K0 [1,16] without using the dot8 instruction. */
+#define ARM_DOT1(a, b, c)          \
+    ({                             \
+        c += (ACC_DATA_TYPE)a * b; \
+    })
+#define ARM_DOT2(a, b, c)                \
+    ({                                   \
+        c += (ACC_DATA_TYPE)a.s0 * b.s0; \
+        c += (ACC_DATA_TYPE)a.s1 * b.s1; \
+    })
+#define ARM_DOT3(a, b, c)                \
+    ({                                   \
+        ARM_DOT2(a, b, c);               \
+        c += (ACC_DATA_TYPE)a.s2 * b.s2; \
+    })
+#define ARM_DOT4(a, b, c)                \
+    ({                                   \
+        ARM_DOT3(a, b, c);               \
+        c += (ACC_DATA_TYPE)a.s3 * b.s3; \
+    })
+#define ARM_DOT8(a, b, c)            \
+    ({                               \
+        ARM_DOT4((a.lo), (b.lo), c); \
+        ARM_DOT4((a.hi), (b.hi), c); \
+    })
+#define ARM_DOT16(a, b, c)           \
+    ({                               \
+        ARM_DOT8((a.lo), (b.lo), c); \
+        ARM_DOT8((a.hi), (b.hi), c); \
+    })
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+/** Specialized macros to perform a broadcast dot product operation between one vector "a" and N0 vectors "b" of size K0 [1,16] */
+#define ARM_DOT_K0X1(k0, a, b, c)         \
+    ({                                    \
+        ARM_DOT_K0(k0, (a), (b##0), (c)); \
+    })
+#define ARM_DOT_K0X2(k0, a, b, c)            \
+    ({                                       \
+        ARM_DOT_K0(k0, (a), (b##0), (c.s0)); \
+        ARM_DOT_K0(k0, (a), (b##1), (c.s1)); \
+    })
+#define ARM_DOT_K0X3(k0, a, b, c)            \
+    ({                                       \
+        ARM_DOT_K0X2(k0, a, b, c);           \
+        ARM_DOT_K0(k0, (a), (b##2), (c.s2)); \
+    })
+#define ARM_DOT_K0X4(k0, a, b, c)            \
+    ({                                       \
+        ARM_DOT_K0X3(k0, a, b, c);           \
+        ARM_DOT_K0(k0, (a), (b##3), (c.s3)); \
+    })
+#define ARM_DOT_K0X8(k0, a, b, c)            \
+    ({                                       \
+        ARM_DOT_K0X4(k0, a, b, c);           \
+        ARM_DOT_K0(k0, (a), (b##4), (c.s4)); \
+        ARM_DOT_K0(k0, (a), (b##5), (c.s5)); \
+        ARM_DOT_K0(k0, (a), (b##6), (c.s6)); \
+        ARM_DOT_K0(k0, (a), (b##7), (c.s7)); \
+    })
+#define ARM_DOT_K0X16(k0, a, b, c)           \
+    ({                                       \
+        ARM_DOT_K0X8(k0, a, b, c);           \
+        ARM_DOT_K0(k0, (a), (b##8), (c.s8)); \
+        ARM_DOT_K0(k0, (a), (b##9), (c.s9)); \
+        ARM_DOT_K0(k0, (a), (b##A), (c.sA)); \
+        ARM_DOT_K0(k0, (a), (b##B), (c.sB)); \
+        ARM_DOT_K0(k0, (a), (b##C), (c.sC)); \
+        ARM_DOT_K0(k0, (a), (b##D), (c.sD)); \
+        ARM_DOT_K0(k0, (a), (b##E), (c.sE)); \
+        ARM_DOT_K0(k0, (a), (b##F), (c.sF)); \
+    })
+
+/** Specialized macros to perform a partial matrix multiplication with dimensions M0,N0,K0 */
+#define ARM_MM_K0XN0X1(n0, k0, a, b, c)           \
+    ({                                            \
+        ARM_DOT_K0XN0(n0, k0, (a##0), b, (c##0)); \
+    })
+#define ARM_MM_K0XN0X2(n0, k0, a, b, c)           \
+    ({                                            \
+        ARM_MM_K0XN0X1(n0, k0, a, b, c);          \
+        ARM_DOT_K0XN0(n0, k0, (a##1), b, (c##1)); \
+    })
+#define ARM_MM_K0XN0X3(n0, k0, a, b, c)           \
+    ({                                            \
+        ARM_MM_K0XN0X2(n0, k0, a, b, c);          \
+        ARM_DOT_K0XN0(n0, k0, (a##2), b, (c##2)); \
+    })
+#define ARM_MM_K0XN0X4(n0, k0, a, b, c)           \
+    ({                                            \
+        ARM_MM_K0XN0X3(n0, k0, a, b, c);          \
+        ARM_DOT_K0XN0(n0, k0, (a##3), b, (c##3)); \
+    })
+#define ARM_MM_K0XN0X5(n0, k0, a, b, c)           \
+    ({                                            \
+        ARM_MM_K0XN0X4(n0, k0, a, b, c);          \
+        ARM_DOT_K0XN0(n0, k0, (a##4), b, (c##4)); \
+    })
+#define ARM_MM_K0XN0X6(n0, k0, a, b, c)           \
+    ({                                            \
+        ARM_MM_K0XN0X5(n0, k0, a, b, c);          \
+        ARM_DOT_K0XN0(n0, k0, (a##5), b, (c##5)); \
+    })
+#define ARM_MM_K0XN0X7(n0, k0, a, b, c)           \
+    ({                                            \
+        ARM_MM_K0XN0X6(n0, k0, a, b, c);          \
+        ARM_DOT_K0XN0(n0, k0, (a##6), b, (c##6)); \
+    })
+#define ARM_MM_K0XN0X8(n0, k0, a, b, c)           \
+    ({                                            \
+        ARM_MM_K0XN0X7(n0, k0, a, b, c);          \
+        ARM_DOT_K0XN0(n0, k0, (a##7), b, (c##7)); \
+    })
+
+#define ARM_DOT_K0(k0, a, b, c) \
+    ({                          \
+        CONCAT(ARM_DOT, k0)     \
+        ((a), (b), (c));        \
+    })
+
+#define ARM_DOT_K0XN0(n0, k0, a, b, c) \
+    ({                                 \
+        CONCAT(ARM_DOT_K0X, n0)        \
+        (k0, (a), b, (c));             \
+    })
+
+#define ARM_MM_K0XN0XM0(m0, n0, k0, a, b, c) \
+    ({                                       \
+        CONCAT(ARM_MM_K0XN0X, m0)            \
+        (n0, k0, a, b, c);                   \
+    })
+
+/** Specialized macros to perform a broadcast dot product operation between one vector "a" and N0 vectors "b" of size K0 [1,16] */
+#define ARM_MUL_N0X1(VECTOR_ACC_TYPE, a, b, c)   \
+    ({                                           \
+        c += CONVERT(b##0, VECTOR_ACC_TYPE) * a; \
+    })
+#define ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c)        \
+    ({                                                \
+        c += CONVERT(b##0, VECTOR_ACC_TYPE) * a.s##0; \
+        c += CONVERT(b##1, VECTOR_ACC_TYPE) * a.s##1; \
+    })
+#define ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c)        \
+    ({                                                \
+        ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c);       \
+        c += CONVERT(b##2, VECTOR_ACC_TYPE) * a.s##2; \
+    })
+#define ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c)        \
+    ({                                                \
+        ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c);       \
+        c += CONVERT(b##3, VECTOR_ACC_TYPE) * a.s##3; \
+    })
+#define ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c)        \
+    ({                                                \
+        ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c);       \
+        c += CONVERT(b##4, VECTOR_ACC_TYPE) * a.s##4; \
+        c += CONVERT(b##5, VECTOR_ACC_TYPE) * a.s##5; \
+        c += CONVERT(b##6, VECTOR_ACC_TYPE) * a.s##6; \
+        c += CONVERT(b##7, VECTOR_ACC_TYPE) * a.s##7; \
+    })
+#define ARM_MUL_N0X16(VECTOR_ACC_TYPE, a, b, c)       \
+    ({                                                \
+        ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c);       \
+        c += CONVERT(b##8, VECTOR_ACC_TYPE) * a.s##8; \
+        c += CONVERT(b##9, VECTOR_ACC_TYPE) * a.s##9; \
+        c += CONVERT(b##A, VECTOR_ACC_TYPE) * a.s##A; \
+        c += CONVERT(b##B, VECTOR_ACC_TYPE) * a.s##B; \
+        c += CONVERT(b##C, VECTOR_ACC_TYPE) * a.s##C; \
+        c += CONVERT(b##D, VECTOR_ACC_TYPE) * a.s##D; \
+        c += CONVERT(b##E, VECTOR_ACC_TYPE) * a.s##E; \
+        c += CONVERT(b##F, VECTOR_ACC_TYPE) * a.s##F; \
+    })
+/** Specialized macros to perform a a partial matrix multiplication with dimensions M0,N0,K0 */
+#define ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c)    \
+    ({                                                         \
+        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##0), b, (c##0)); \
+    })
+#define ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c)    \
+    ({                                                         \
+        ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c);   \
+        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##1), b, (c##1)); \
+    })
+#define ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c)    \
+    ({                                                         \
+        ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c);   \
+        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##2), b, (c##2)); \
+    })
+#define ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c)    \
+    ({                                                         \
+        ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c);   \
+        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##3), b, (c##3)); \
+    })
+#define ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c)    \
+    ({                                                         \
+        ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c);   \
+        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##4), b, (c##4)); \
+    })
+#define ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c)    \
+    ({                                                         \
+        ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c);   \
+        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##5), b, (c##5)); \
+    })
+#define ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c)    \
+    ({                                                         \
+        ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c);   \
+        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##6), b, (c##6)); \
+    })
+#define ARM_MM_NATIVE_N0XK0X8(VECTOR_ACC_TYPE, k0, a, b, c)    \
+    ({                                                         \
+        ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c);   \
+        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##7), b, (c##7)); \
+    })
+#define ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, a, b, c) \
+    ({                                              \
+        CONCAT(ARM_MUL_N0X, k0)                     \
+        (VECTOR_ACC_TYPE, (a), b, (c));             \
+    })
+#define ARM_MM_NATIVE_N0XK0XM0(VECTOR_ACC_TYPE, m0, k0, a, b, c) \
+    ({                                                           \
+        CONCAT(ARM_MM_NATIVE_N0XK0X, m0)                         \
+        (VECTOR_ACC_TYPE, k0, a, b, c);                          \
+    })
+
+#if defined(GEMMLOWP_MM_RESHAPED_LHS_NT_RHS_T)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices with QASYMM/QASYMM_SIGNED data type.
+ *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
+ *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
+ * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (i.e. -DM=52 and -DN=90).
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (i.e. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
+ *  - V0 >= 1
+ *  - H0 >= 1
+ *
+ * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in]  lhs_ptr                           Pointer to the LHS reshaped matrix. Supported data type: QASYMM8/QASYMM_SIGNED
+ * @param[in]  lhs_stride_x                      Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                      Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
+ * @param[in]  rhs_ptr                           Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_x                      Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  rhs_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rhs_stride_y                      Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  rhs_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data type: S32
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in]  k                                 Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ * @param[in]  lhs_stride_z                      Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                      Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_cross_plane_pad               (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
+                                                IMAGE_DECLARATION(rhs),
+                                                IMAGE_DECLARATION(dst),
+                                                uint k,
+                                                uint lhs_stride_z,
+                                                uint rhs_stride_z,
+                                                uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                ,
+                                                uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                               )
+{
+    // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (K0)
+#define LHS_STEP_X ((K0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (K0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(INTERLEAVE)
+
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x * N0 >= N) || (y * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    __global DATA_TYPE *lhs_addr = (__global DATA_TYPE *)(lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z));
+
+    // Compute RHS matrix address
+    __global DATA_TYPE *rhs_addr = (__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y);
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    rhs_addr += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(ACC_DATA_TYPE, N0)    c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+    for(int i = 0; i < k; i += K0)
+    {
+        // Load values from LHS matrix
+        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X, zlhs);
+
+        // Load values from RHS matrix
+        LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X, zrhs);
+
+        // Partial matrix multiplication M0,N0,K0
+        ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);
+
+        // Update address
+        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP);
+        rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP);
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (y * (uint)M0 * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Convert and store output block
+    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
+    // Store output block
+    REPEAT_VAR_INIT_CONVERT_SAT(M0, VEC_DATA_TYPE(int, N0), c, c_lp);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(GEMMLOWP_MM_RESHAPED_LHS_NT_RHS_T)
+
+#if defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT) || defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T)
+#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)
+#define FUSED_OUTPUT_STAGE_FIXED_POINT
+#endif // defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices with fused output stage using fixed-point arithmetic.
+ *  The LHS matrix is NOT reshaped
+ *  The RHS matrix is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
+ * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
+ * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
+ *  - H0 >= 1
+ *
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @note The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULTIPLIER and -DRESULT_SHIFT
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ *       These values can be used to implement "rectified linear unit" activation functions
+ * @note In case of per-channel quantization of matrix B, -DPER_CHANNEL_QUANTIZATION must be passed at compile time.
+ *
+ * @param[in]  lhs_ptr                                          Pointer to the LHS reshaped matrix. Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  lhs_stride_x                                     Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                                       src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                                     Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                                       src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes                The offset of the first element in the LHS reshaped matrix
+ * @param[in]  rhs_ptr                                          Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_x                                     Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  rhs_step_x                                       src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rhs_stride_y                                     Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  rhs_step_y                                       src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rhs_offset_first_element_in_bytes                The offset of the first element in the RHS reshaped matrix
+ * @param[out] dst_ptr                                          Pointer to the destination matrix Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_x                                     Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                                       dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                                     Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                                       dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes                The offset of the first element in the destination matrix
+ * @param[in]  lhs_stride_z                                     Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                                     Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                                     Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  lhs_cross_plane_pad                              (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in]  dst_cross_plane_pad                              (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in]  sum_col_ptr                                      (Optional) Pointer to the source tensor. Supported data type: S32
+ * @param[in]  sum_col_stride_x                                 (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_col_step_x                                   (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_col_stride_y                                 (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_col_step_y                                   (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_col_offset_first_element_in_bytes            (Optional) The offset of the first element in the source tensor
+ * @param[in]  sum_row_ptr                                      (Optional) Pointer to the source tensor. Supported data type: S32
+ * @param[in]  sum_row_stride_x                                 (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_row_step_x                                   (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_row_stride_y                                 (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_row_step_y                                   (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_row_offset_first_element_in_bytes            (Optional) The offset of the first element in the source tensor
+ * @param[in]  biases_ptr                                       (Optional) Pointer to the biases tensor. Supported data type: S32
+ * @param[in]  biases_stride_x                                  (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                                    (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes             (Optional) The offset of the first element in the biases tensor
+ * @param[in]  result_multipliers_ptr                           (Optional) Pointer to the output multipliers vector for per-channel quantization. Supported data types: S32
+ * @param[in]  result_multipliers_stride_x                      (Optional) Stride of the output multipliers vector in X dimension (in bytes)
+ * @param[in]  result_multipliers_step_x                        (Optional) output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first element in the output multipliers vector
+ * @param[in]  result_shifts_ptr                                (Optional) Pointer to the output shifts vector for per-channel quantization. Supported data types: S32
+ * @param[in]  result_shifts_stride_x                           (Optional) Stride of the output shifts vector in X dimension (in bytes)
+ * @param[in]  result_shifts_step_x                             (Optional) output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  result_shifts_offset_first_element_in_bytes      (Optional) The offset of the first element in the output shifts vector
+ */
+#if defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT)
+__kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint
+#elif defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T) // defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT)
+__kernel void gemmlowp_mm_reshaped_only_rhs_t
+#endif                                         // defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T)
+(IMAGE_DECLARATION(lhs),
+ IMAGE_DECLARATION(rhs),
+ IMAGE_DECLARATION(dst),
+ uint lhs_stride_z,
+ uint rhs_stride_z,
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+#if defined(A_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ VECTOR_DECLARATION(biases)
+#endif // defined(ADD_BIAS)
+#if defined(PER_CHANNEL_QUANTIZATION)
+ ,
+ VECTOR_DECLARATION(result_multipliers),
+ VECTOR_DECLARATION(result_shifts)
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+)
+{
+    // @note: replace with (DIMENSION + PAD) once we pass the relevant info at compile time
+#define FULL_LHS_HEIGHT (lhs_stride_z / lhs_stride_y)
+#define FULL_DST_HEIGHT (dst_stride_z / dst_stride_y)
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X (K0 * H0)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0 * N0)
+#define RHS_STEP_X (K0)
+#endif // defined(RHS_INTERLEAVE)
+#define RHS_STEP_LOOP (N0 * K0 * H0)
+
+    uint x  = GET_SPATIAL_IDX(0, 1, 1);
+    uint y  = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+    uint z  = GET_SPATIAL_IDX(2, 1, 1);
+    int  xo = (x * N0);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((xo >= N) || (y >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    uint lhs_y = y + z * FULL_LHS_HEIGHT;
+
+    // Compute RHS matrix address
+    uint rhs_offset_x = (x % H0) * RHS_OFFSET_X;
+    uint rhs_offset_y = (x / H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    rhs_offset_y += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    rhs_offset_y += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    // Initialize the accumulators
+    TILE(ACC_DATA_TYPE, M0, N0, c);
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c[i].v = 0;
+    })
+
+    int i = 0;
+    for(; i <= (K - K0); i += K0)
+    {
+        TILE(DATA_TYPE, M0, K0, a);
+        TILE(DATA_TYPE, N0, K0, b);
+
+        // Load values from LHS matrix
+        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, i, lhs_y, 1, lhs_stride_y, a);
+
+        // // Load values from RHS matrix
+        LOOP_UNROLLING(int, _i, 0, 1, N0,
+        {
+            b[_i].v = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + rhs_offset_x + rhs_offset_y + _i * RHS_STEP_X));
+        })
+
+        // Partial matrix multiplication M0,N0,K0
+        T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c);
+
+        rhs_offset_x += RHS_STEP_LOOP;
+    }
+
+#if((K % K0) != 0)
+
+    // Left-over accumulations
+    for(; i < K; ++i)
+    {
+        TILE(DATA_TYPE, M0, 1, a);
+        TILE(DATA_TYPE, N0, 1, b);
+
+        // Load values from LHS matrix
+        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, i, lhs_y, 1, lhs_stride_y, a);
+
+        LOOP_UNROLLING(int, _i, 0, 1, N0,
+        {
+            b[_i].v = *(__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + rhs_offset_x + rhs_offset_y + _i * RHS_STEP_X);
+        })
+
+        T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c);
+
+        rhs_offset_x += 1;
+    }
+#endif // ((K % K0) != 0)
+
+#if defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
+
+    TILE(int, M0, N0, c_int);
+    TILE(int, M0, N0, offset_s32);
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        offset_s32[i].v = (VEC_DATA_TYPE(int, N0))K_OFFSET;
+    })
+
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c_int[i].v = CONVERT_SAT(c[i].v, VEC_DATA_TYPE(int, N0));
+    })
+
+#if defined(A_OFFSET)
+
+#if defined(SUM_COL_HAS_BATCHES)
+    int sum_col_y = z;
+#else  // defined(SUM_COL_HAS_BATCHES)
+    int sum_col_y = 0;
+#endif // defined(SUM_COL_HAS_BATCHES)
+    TILE(int, 1, N0, a_offset_s32);
+
+    T_LOAD(int, 1, N0, BUFFER, sum_col, xo, sum_col_y, 1, sum_col_stride_y, a_offset_s32);
+
+    a_offset_s32[0].v *= A_OFFSET;
+
+    T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, a_offset_s32, offset_s32);
+#endif // defined(A_OFFSET)
+
+#if defined(B_OFFSET)
+    // Compute the offset contribution due to B_OFFSET
+    // Note: The sum_row tensor is generated through CLGEMMLowpMatrixAReductionKernel which
+    // does not introduce paddings. For this reason is safe to access the tensor in this manner
+    // without considering that the coordinate "y" could come from an input 3D tensor
+    TILE(int, M0, N0, b_offset_s32);
+
+    T_LOAD(int, M0, 1, BUFFER, sum_row, y + z * (sum_row_stride_y / sizeof(int)), 0, 1, sum_row_stride_x, b_offset_s32);
+
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        offset_s32[i].v += b_offset_s32[i].v *B_OFFSET;
+    })
+
+#endif // defined(B_OFFSET)
+
+#if defined(ADD_BIAS)
+
+    TILE(int, 1, N0, bias);
+
+    T_LOAD(int, 1, N0, BUFFER, biases, xo, 0, 1, 0, bias);
+
+    T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, bias, offset_s32);
+#endif // defined(ADD_BIAS)
+
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c_int[i].v += offset_s32[i].v;
+    })
+
+    TILE(DATA_TYPE, M0, N0, c_lp);
+
+    // Multiply by result_mult_int and shift
+#if defined(PER_CHANNEL_QUANTIZATION)
+    TILE(int, 1, N0, res_mul);
+    TILE(int, 1, N0, res_shift);
+
+    T_LOAD(int, 1, N0, BUFFER, result_multipliers, xo, 0, 0, 0, res_mul);
+    T_LOAD(int, 1, N0, BUFFER, result_shifts, xo, 0, 0, 0, res_shift);
+
+    T_QUANTIZE8(int, DATA_TYPE, PER_CHANNEL, M0, N0, RESULT_OFFSET, RESULT_SHIFT, RESULT_MULTIPLIER, c_int, res_mul, res_shift, c_lp);
+#else  // defined(PER_CHANNEL_QUANTIZATION)
+    T_QUANTIZE8(int, DATA_TYPE, PER_TENSOR, M0, N0, RESULT_OFFSET, RESULT_SHIFT, RESULT_MULTIPLIER, c_int, 0, 0, c_lp);
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+
+#if defined(MIN_BOUND)
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c_lp[i].v = max(c_lp[i].v, (VEC_DATA_TYPE(DATA_TYPE, N0))MIN_BOUND);
+    })
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c_lp[i].v = min(c_lp[i].v, (VEC_DATA_TYPE(DATA_TYPE, N0))MAX_BOUND);
+    })
+#endif // defined(MAX_BOUND)
+
+#else  // defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
+    TILE(int, M0, N0, c_lp);
+
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c_lp[i].v = CONVERT_SAT(c[i].v, VEC_DATA_TYPE(int, N0));
+    })
+#endif // defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
+
+    TILE(uint, M0, 1, dst_indirect_y);
+
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+        dst_indirect_y[i].v = (uint)min((int)((y + i) % HEIGHT_GEMM3D), (int)HEIGHT_GEMM3D - 1);
+        dst_indirect_y[i].v += (uint)min((int)((y + i) / HEIGHT_GEMM3D), (int)DEPTH_GEMM3D - 1) * FULL_DST_HEIGHT;
+        dst_indirect_y[i].v += z *FULL_DST_HEIGHT *DEPTH_GEMM3D;
+#else  // (REINTERPRET_OUTPUT_AS_3D)
+        dst_indirect_y[i].v = (uint)min((int)y + i, (int)M - 1) + z *FULL_DST_HEIGHT;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+    })
+
+    const bool cond_x = (xo > (N - N0)) & (PARTIAL_STORE_N0 != 0);
+
+#if defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, xo, dst_stride_y, cond_x, c_lp, dst_indirect_y);
+#else  // defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
+    T_STORE_INDIRECT_WIDTH_SELECT(int, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, xo, dst_stride_y, cond_x, c_lp, dst_indirect_y);
+#endif // defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
+
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+#undef RHS_STEP_LOOP
+}
+#endif // defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T_FUSED_OUTPUT_STAGE_FIXEDPOINT) || defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_T)
+
+#if defined(GEMMLOWP_MM_NATIVE)
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ *  The LHS matrix is NOT reshaped
+ *  The RHS matrix is NOT reshaped
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
+ * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
+ * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
+ * @note The number of N0 columns to process must be passed at compile time using -DN0 (i.e. -DN0=2)
+ * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (i.e., -DK0=2)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
+ *
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in]  lhs_ptr                           Pointer to the LHS reshaped matrix. Supported data type: QASYMM8
+ * @param[in]  lhs_stride_x                      Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                      Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
+ * @param[in]  rhs_ptr                           Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_x                      Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  rhs_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rhs_stride_y                      Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  rhs_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data type: S32
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in]  lhs_stride_z                      Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                      Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  lhs_cross_plane_pad               (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in]  dst_cross_plane_pad               (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs),
+                                 IMAGE_DECLARATION(rhs),
+                                 IMAGE_DECLARATION(dst),
+                                 uint lhs_stride_z,
+                                 uint rhs_stride_z,
+                                 uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                 ,
+                                 uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                 ,
+                                 uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                )
+{
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x * N0 >= N) || (y * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;
+
+    // Compute RHS matrix address
+    uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0);
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply lhs_stride_z by DEPTH_GEMM3D
+    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(ACC_DATA_TYPE, N0)    c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+    int i = 0;
+
+    for(; i <= (K - K0); i += K0)
+    {
+        // Load values from LHS matrix
+        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+        // Load values from RHS matrix
+        LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);
+
+        // Partial matrix multiplication M0,N0,K0
+#if(GPU_ARCH == GPU_ARCH_MIDGARD)
+        ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, K0, a, b, c);
+#else  // GPU_ARCH == GPU_ARCH_MIDGARD
+        // Transpose the values from RHS matrix
+        TRANSPOSE_K0XN0(K0, N0, b_t, b, DATA_TYPE);
+
+        ARM_MM_K0XN0XM0(M0, N0, K0, a, b_t, c);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+        // Update the offset
+        lhs_offset += K0;
+        rhs_offset += K0 * rhs_stride_y;
+    }
+
+    // Left-over for loop
+    for(; i < K; ++i)
+    {
+        // Load values from LHS matrix
+        LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+        // Load values from RHS matrix
+        LOAD_BLOCK(1, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);
+
+        // Partial matrix multiplication M0,N0,1
+#if(GPU_ARCH == GPU_ARCH_MIDGARD)
+        ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, 1, a, b, c);
+#else  // GPU_ARCH == GPU_ARCH_MIDGARD
+        // Transpose the values from RHS matrix
+        TRANSPOSE_K0XN0(1, N0, b_t, b, DATA_TYPE);
+
+        ARM_MM_K0XN0XM0(M0, N0, 1, a, b_t, c);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+        // Update the offset
+        lhs_offset += 1;
+        rhs_offset += rhs_stride_y;
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+    const bool cond_y = y == 0;
+    const bool cond_x = ((x + 1) * N0 >= N);
+
+    // Convert and store output block
+    REPEAT_VAR_INIT_CONVERT(M0, VEC_DATA_TYPE(int, N0), c, res); // resN = CONVERT(cN, VEC_DATA_TYPE(int, N0));
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, res, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+}
+#endif // defined(GEMMLOWP_MM_NATIVE)
+
+#if defined(GEMMLOWP_MATRIX_A_REDUCTION)
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
+ * It is also possible to multiply each reduced row by a scalar value, if SCALAR is passed at compile time.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ *
+ * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
+ * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
+ * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (e.g. -DSCALAR=3)
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED/QSYMM8
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor Supported data type: S32
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),
+                                          IMAGE_DECLARATION(dst))
+{
+    // Compute source and destination addresses
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Image    dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
+    sum_row_32            = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))0;
+    ACC_DATA_TYPE sum_row = 0;
+
+    __global const DATA_TYPE *matrix_a = (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);
+
+    int i = 0;
+
+    // This for loop performs 16 accumulations
+    for(; i <= ((int)COLS_A - 16); i += 16)
+    {
+        const VEC_DATA_TYPE(DATA_TYPE, 16) a0 = vload16(0, matrix_a + i);
+
+        sum_row_32 += CONVERT(a0.s0123, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.s4567, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.s89AB, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.sCDEF,
+                      VEC_DATA_TYPE(ACC_DATA_TYPE, 4));
+    }
+
+    // This for loop performs the leftover accumulations
+    for(; i < COLS_A; ++i)
+    {
+        sum_row += (ACC_DATA_TYPE)matrix_a[i];
+    }
+
+    sum_row += sum_row_32.s0 + sum_row_32.s1 + sum_row_32.s2 + sum_row_32.s3;
+
+#if defined(SCALAR)
+    sum_row *= (int)SCALAR;
+#endif // defined(SCALAR)
+    *((__global int *)dst.ptr) = (int)sum_row;
+}
+#endif // defined(GEMMLOWP_MATRIX_A_REDUCTION)
+
+#if defined(GEMMLOWP_MATRIX_A_REDUCTION_DOT8)
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A using the arm dot product instruction.
+ * It is also possible to multiply each reduced row by a scalar value, if SCALAR is passed at compile time.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ *
+ * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
+ * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
+ * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (e.g. -DSCALAR=3)
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED/QSYMM8
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor Supported data type: S32
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src),
+                                               IMAGE_DECLARATION(dst))
+{
+    // Compute source and destination addresses
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Image    dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    ACC_DATA_TYPE sum_row = 0;
+
+    __global const DATA_TYPE *matrix_a = (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);
+
+    int i = 0;
+
+    // This for loop performs 16 accumulations
+    for(; i <= ((int)COLS_A - 32); i += 32)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 16)
+        a0 = vload16(0, matrix_a + i);
+
+        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
+        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
+        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
+        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
+
+        a0 = vload16(1, matrix_a + i);
+
+        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
+        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
+        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
+        DOT_PRODUCT4_INTEGER8(DATA_TYPE, DATA_TYPE, DATA_TYPE, a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1), sum_row);
+    }
+
+    // This for loop performs the leftover accumulations
+    for(; i < COLS_A; ++i)
+    {
+        sum_row += (ACC_DATA_TYPE)matrix_a[i];
+    }
+
+#if defined(SCALAR)
+    sum_row *= (int)SCALAR;
+#endif // defined(SCALAR)
+    *((__global int *)dst.ptr) = (int)sum_row;
+}
+#endif // defined(GEMMLOWP_MATRIX_A_REDUCTION_DOT8)
+
+#if defined(GEMMLOWP_MATRIX_B_REDUCTION)
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
+ * It is also possible to multiply each reduced column by a scalar value, if SCALAR is passed at compile time.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ *
+ * @attention The number of matrix B columns and rows needs to be passed at compile time using -DCOLS_B and -DROWS_B
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
+ * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
+ * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (i.e. -DSCALAR=3)
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor Supported data type: S32
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src),
+                                          IMAGE_DECLARATION(dst))
+{
+    // Compute source and destination addresses
+    const uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+    const uint y      = get_global_id(1);
+
+    __global const DATA_TYPE *matrix_b = (__global const DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + y * src_step_y + y * src_stride_z);
+    __global uchar *dst_addr           = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(int) + y * dst_stride_y;
+
+    VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+    sum_col_32 = (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))0;
+
+    int i = 0;
+    // This for loop performs 4 accumulations
+    for(; i <= ((int)ROWS_B - 4); i += 4)
+    {
+        const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+        b0 = VLOAD(VEC_SIZE)(0, matrix_b + 0 * src_stride_y);
+        const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+        b1 = VLOAD(VEC_SIZE)(0, matrix_b + 1 * src_stride_y);
+        const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+        b2 = VLOAD(VEC_SIZE)(0, matrix_b + 2 * src_stride_y);
+        const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+        b3 = VLOAD(VEC_SIZE)(0, matrix_b + 3 * src_stride_y);
+
+        sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b1, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b2, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b3,
+                      VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+
+        matrix_b += 4 * src_stride_y;
+    }
+
+    // This for loop perfoms the leftover accumulations
+    for(; i < (int)ROWS_B; ++i)
+    {
+        const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+        b0 = VLOAD(VEC_SIZE)(0, matrix_b);
+
+        sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+
+        matrix_b += src_stride_y;
+    }
+
+#if defined(SCALAR)
+    sum_col_32 *= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))SCALAR;
+#endif // defined(SCALAR)
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    res0 = CONVERT(sum_col_32, VEC_DATA_TYPE(int, VEC_SIZE));
+
+    STORE_VECTOR_SELECT(res, int, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif // defined(GEMMLOWP_MATRIX_B_REDUCTION)
+
+#endif // defined(DATA_TYPE) && defined(ACC_DATA_TYPE)
+
+#if defined(K_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
+
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+
+/* Helper function used to calculate the offset contribution after matrix multiplication.
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication),
+ * and calculates the offset contribution of matrix A and matrix B.
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in] x                                     max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0)
+ * @param[in] y                                     get_global_id(1)
+ * @param[in] z                                     get_global_id(2)
+ * @param[in] sum_col_ptr                           (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_col_stride_x                      (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_col_step_x                        (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y                      (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_col_step_y                        (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ * @param[in] sum_row_ptr                           (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_row_stride_x                      (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_row_step_x                        (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y                      (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_row_step_y                        (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ * @param[in] biases_ptr                            (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x                       (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases tensor
+ */
+inline VEC_INT offset_contribution(
+    int x,
+    int y,
+    int z
+#if defined(A_OFFSET)
+    ,
+    IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+    ,
+    IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+    ,
+    VECTOR_DECLARATION(biases)
+#endif // defined(ADD_BIAS)
+)
+{
+    VEC_INT a_offset_s32 = (VEC_INT)0;
+    VEC_INT b_offset_s32 = (VEC_INT)0;
+
+    int batch_id = z;
+#if defined(DEPTH_INPUT3D)
+    batch_id /= (int)DEPTH_INPUT3D;
+#endif // defined(DEPTH_INPUT3D)
+
+#if defined(A_OFFSET)
+    // Compute the offset contribution due to A_OFFSET
+    __global uchar *sum_col_addr = sum_col_ptr + sum_col_offset_first_element_in_bytes + x * sizeof(int);
+
+    // Compute the offset contribution due to A_OFFSET
+#if defined(SUM_COL_HAS_BATCHES)
+    a_offset_s32 = VLOAD(VEC_SIZE)(0, (__global int *)(sum_col_addr + batch_id * sum_col_stride_y));
+#else  // defined(SUM_COL_HAS_BATCHES)
+    a_offset_s32 = VLOAD(VEC_SIZE)(0, (__global int *)sum_col_addr);
+#endif // defined(SUM_COL_HAS_BATCHES)
+
+    a_offset_s32 *= (VEC_INT)A_OFFSET;
+#endif // defined(A_OFFSET)
+
+#if defined(B_OFFSET)
+    // Compute the offset contribution due to A_OFFSET
+    __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + y * sizeof(int);
+
+    // Compute the offset contribution due to B_OFFSET
+#if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
+    b_offset_s32 = (VEC_INT) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)) + (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D);
+#else  // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
+    b_offset_s32 = (VEC_INT) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)));
+#endif // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
+    b_offset_s32 *= (VEC_INT)B_OFFSET;
+#endif // defined(B_OFFSET)
+
+#if defined(ADD_BIAS)
+    // Add bias
+    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+    VEC_INT biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
+    b_offset_s32 += (VEC_INT)biases_values;
+#endif // defined(ADD_BIAS)
+
+    return (VEC_INT)K_OFFSET + a_offset_s32 + b_offset_s32;
+}
+
+#if defined(GEMMLOWP_OFFSET_CONTRIBUTION)
+/* OpenCL kernel used to add the offset contribution after matrix multiplication. The computation is performed in-place
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication),
+ * and adds to it the offset contribution of matrix A and matrix B in-place.
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * The final result is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ *                   (sum_col[k] * A_OFFSET) +
+ *                   (sum_row[i] * B_OFFSET) +
+ *                   (K_OFFSET)
+ *
+ * @param[in] mm_result_ptr                           Pointer to the source tensor. Supported data type: S32
+ * @param[in] mm_result_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in] mm_result_step_x                        mm_result_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mm_result_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] mm_result_step_y                        mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] mm_result_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] mm_result_step_z                        mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] sum_col_ptr                             (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_col_stride_x                        (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_col_step_x                          (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y                        (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_col_step_y                          (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes   (Optional) The offset of the first element in the source tensor
+ * @param[in] sum_row_ptr                             (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_row_stride_x                        (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in] sum_row_step_x                          (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y                        (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] sum_row_step_y                          (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes   (Optional) The offset of the first element in the source tensor
+ * @param[in] biases_ptr                              (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x                         (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x                           (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes    (Optional) The offset of the first element in the biases tensor
+ */
+__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)
+#if defined(A_OFFSET)
+                                           ,
+                                           IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+                                           ,
+                                           IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+                                           ,
+                                           VECTOR_DECLARATION(biases)
+#endif // defined(ADD_BIAS))
+                                          )
+{
+    const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+
+    // Compute offset contribution
+    VEC_INT offset_term_s32 = offset_contribution(
+                                  x, y, z
+#if defined(A_OFFSET)
+                                  ,
+                                  sum_col_ptr,
+                                  sum_col_stride_x,
+                                  sum_col_step_x,
+                                  sum_col_stride_y,
+                                  sum_col_step_y,
+                                  sum_col_offset_first_element_in_bytes
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+                                  ,
+                                  sum_row_ptr,
+                                  sum_row_stride_x,
+                                  sum_row_step_x,
+                                  sum_row_stride_y,
+                                  sum_row_step_y,
+                                  sum_row_offset_first_element_in_bytes
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+                                  ,
+                                  biases_ptr,
+                                  biases_stride_x,
+                                  biases_step_x,
+                                  biases_offset_first_element_in_bytes
+#endif // defined(ADD_BIAS)
+                              );
+
+    __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
+
+    VEC_INT in_s32_0 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr);
+
+    // Add the offset terms to GEMM's result
+    in_s32_0 += offset_term_s32;
+
+    // Store the result with the offset contribution
+    STORE_VECTOR_SELECT(in_s32_, int, mm_result_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif // defined(GEMMLOWP_OFFSET_CONTRIBUTION)
+
+#if defined(GEMMLOWP_OFFSET_CONTRIBUTION_QUANTIZE_DOWN)
+/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and it quantizes down to uint8.
+ *
+ * This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output stage.
+ *
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
+ *
+ * The result before the output stage is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ *                   (sum_col[k] * A_OFFSET) +
+ *                   (sum_row[i] * B_OFFSET) +
+ *                   (K_OFFSET)
+ *
+ * This result is quantized down to uint8/int8 using the output stage. The output stage computes the following operations:
+ *
+ *  -# Add offset terms to final result
+ *  -# Multiply each entry of result by result_mult_int
+ *  -# Add bias to final result (if -DADD_BIAS is passed at compile time)
+ *  -# Shift the int32 accumulator by result_shift
+ *  -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
+ *  -# Clamp the resulting int32 values:
+ *      - to the [0..255] range and cast to QASYMM8.
+ *      - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ *       These values can be used to implement "rectified linear unit" activation functions
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in]  mm_result_ptr                                    Pointer to the source tensor. Supported data type: S32
+ * @param[in]  mm_result_stride_x                               Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  mm_result_step_x                                 mm_result_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mm_result_stride_y                               Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  mm_result_step_y                                 mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  mm_result_stride_z                               Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  mm_result_step_z                                 mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  mm_result_offset_first_element_in_bytes          The offset of the first element in the source tensor
+ * @param[in]  sum_col_ptr                                      (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in]  sum_col_stride_x                                 (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_col_step_x                                   (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_col_stride_y                                 (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_col_step_y                                   (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_col_offset_first_element_in_bytes            (Optional) The offset of the first element in the source tensor
+ * @param[in]  sum_row_ptr                                      (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in]  sum_row_stride_x                                 (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_row_step_x                                   (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_row_stride_y                                 (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_row_step_y                                   (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_row_offset_first_element_in_bytes            (Optional) The offset of the first element in the source tensor
+ * @param[in]  biases_ptr                                       (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in]  biases_stride_x                                  (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                                    (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes             (Optional) The offset of the first element in the biases tensor
+ * @param[out] dst_ptr                                          Pointer to the destination tensor Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  dst_stride_x                                     Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                                       dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                                     Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                                       dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                                     Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                                       src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes                The offset of the first element in the destination tensor
+ * @param[in]  result_multipliers_ptr                           (Optional) Pointer to the output multipliers vector for per-channel quantization. Supported data types: S32
+ * @param[in]  result_multipliers_stride_x                      (Optional) Stride of the output multipliers vector in X dimension (in bytes)
+ * @param[in]  result_multipliers_step_x                        (Optional) output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first element in the output multipliers vector
+ * @param[in]  result_shifts_ptr                                (Optional) Pointer to the output shifts vector for per-channel quantization. Supported data types: S32
+ * @param[in]  result_shifts_stride_x                           (Optional) Stride of the output shifts vector in X dimension (in bytes)
+ * @param[in]  result_shifts_step_x                             (Optional) output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  result_shifts_offset_first_element_in_bytes      (Optional) The offset of the first element in the output shifts vector
+ */
+__kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm_result)
+#if defined(A_OFFSET)
+                                                         ,
+                                                         IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+                                                         ,
+                                                         IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+                                                         ,
+#if defined(ADD_BIAS)
+                                                         VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+                                                         TENSOR3D_DECLARATION(dst)
+#if defined(PER_CHANNEL_QUANTIZATION)
+                                                         ,
+                                                         VECTOR_DECLARATION(result_multipliers),
+                                                         VECTOR_DECLARATION(result_shifts)
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+                                                        )
+{
+    const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+    // Compute offset contribution
+    VEC_INT offset_term_s32 = offset_contribution(
+                                  x, y, z
+#if defined(A_OFFSET)
+                                  ,
+                                  sum_col_ptr,
+                                  sum_col_stride_x,
+                                  sum_col_step_x,
+                                  sum_col_stride_y,
+                                  sum_col_step_y,
+                                  sum_col_offset_first_element_in_bytes
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+                                  ,
+                                  sum_row_ptr,
+                                  sum_row_stride_x,
+                                  sum_row_step_x,
+                                  sum_row_stride_y,
+                                  sum_row_step_y,
+                                  sum_row_offset_first_element_in_bytes
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+                                  ,
+                                  biases_ptr,
+                                  biases_stride_x,
+                                  biases_step_x,
+                                  biases_offset_first_element_in_bytes
+#endif // defined(ADD_BIAS)
+                              );
+
+    __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
+
+    VEC_INT in_s32 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr);
+
+    // Add the offset terms to GEMM's result
+    in_s32 += offset_term_s32;
+
+    // -------------- OUTPUT STAGE
+
+    // Add the offset terms to GEMM's result
+    in_s32 += (VEC_INT)RESULT_OFFSET;
+
+    // Multiply by result_mult_int and shift
+#if defined(PER_CHANNEL_QUANTIZATION)
+    __global uchar *result_multipliers_addr   = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int);
+    __global uchar *result_shifts_addr        = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int);
+    VEC_INT         result_multipliers_values = VLOAD(VEC_SIZE)(0, (__global int *)result_multipliers_addr);
+    VEC_INT         result_shifts_values      = VLOAD(VEC_SIZE)(0, (__global int *)result_shifts_addr);
+
+    in_s32 *= result_multipliers_values;
+    in_s32 >>= result_shifts_values;
+#else  // defined(PER_CHANNEL_QUANTIZATION)
+    in_s32 *= RESULT_MULTIPLIER;
+
+    in_s32 >>= RESULT_SHIFT;
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+
+    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
+    res0 = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
+
+#if defined(MIN_BOUND)
+    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+    // Store the result
+    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif // defined(GEMMLOWP_OFFSET_CONTRIBUTION_QUANTIZE_DOWN)
+
+#if defined(GEMMLOWP_OFFSET_CONTRIBUTION_QUANTIZE_DOWN_FIXEDPOINT)
+/* OpenCL kernel used to add the offset contribution after matrix multiplication and it quantizes down to uint8.
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output stage.
+ *
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
+ *
+ * The result before the output stage is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ *                   (sum_col[k] * A_OFFSET) +
+ *                   (sum_row[i] * B_OFFSET) +
+ *                   (K_OFFSET)
+ *
+ * This result is quantized down to uint8/int8 using the output stage. The output stage computes the following operations:
+ *
+ *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Round to nearest division by a power-of-two using result_shift
+ *  -# Add offset to each result
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting int32 values:
+ *      - to the [0..255] range and cast to QASYMM8.
+ *      - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ *       These values can be used to implement "rectified linear unit" activation functions
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in]  mm_result_ptr                                    Pointer to the source tensor. Supported data type: S32
+ * @param[in]  mm_result_stride_x                               Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  mm_result_step_x                                 mm_result_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mm_result_stride_y                               Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  mm_result_step_y                                 mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  mm_result_stride_z                               Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  mm_result_step_z                                 mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  mm_result_offset_first_element_in_bytes          The offset of the first element in the source tensor
+ * @param[in]  sum_col_ptr                                      (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in]  sum_col_stride_x                                 (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_col_step_x                                   (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_col_stride_y                                 (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_col_step_y                                   (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_col_offset_first_element_in_bytes            (Optional) The offset of the first element in the source tensor
+ * @param[in]  sum_row_ptr                                      (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in]  sum_row_stride_x                                 (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_row_step_x                                   (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_row_stride_y                                 (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_row_step_y                                   (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_row_offset_first_element_in_bytes            (Optional) The offset of the first element in the source tensor
+ * @param[in]  biases_ptr                                       (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in]  biases_stride_x                                  (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                                    (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes             (Optional) The offset of the first element in the biases tensor
+ * @param[out] dst_ptr                                          Pointer to the destination tensor Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  dst_stride_x                                     Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                                       dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                                     Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                                       dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                                     Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                                       src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes                The offset of the first element in the destination tensor
+ * @param[in]  result_multipliers_ptr                           (Optional) Pointer to the output multipliers vector for per-channel quantization. Supported data types: S32
+ * @param[in]  result_multipliers_stride_x                      (Optional) Stride of the output multipliers vector in X dimension (in bytes)
+ * @param[in]  result_multipliers_step_x                        (Optional) output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first element in the output multipliers vector
+ * @param[in]  result_shifts_ptr                                (Optional) Pointer to the output shifts vector for per-channel quantization. Supported data types: S32
+ * @param[in]  result_shifts_stride_x                           (Optional) Stride of the output shifts vector in X dimension (in bytes)
+ * @param[in]  result_shifts_step_x                             (Optional) output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  result_shifts_offset_first_element_in_bytes      (Optional) The offset of the first element in the output shifts vector
+ */
+__kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DECLARATION(mm_result)
+#if defined(A_OFFSET)
+                                                                    ,
+                                                                    IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+                                                                    ,
+                                                                    IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+                                                                    ,
+#if defined(ADD_BIAS)
+                                                                    VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+                                                                    TENSOR3D_DECLARATION(dst)
+#if defined(PER_CHANNEL_QUANTIZATION)
+                                                                    ,
+                                                                    VECTOR_DECLARATION(result_multipliers),
+                                                                    VECTOR_DECLARATION(result_shifts)
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+                                                                   )
+{
+    const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+
+    // Compute offset contribution
+    VEC_INT offset_term_s32 = offset_contribution(
+                                  x, y, z
+#if defined(A_OFFSET)
+                                  ,
+                                  sum_col_ptr,
+                                  sum_col_stride_x,
+                                  sum_col_step_x,
+                                  sum_col_stride_y,
+                                  sum_col_step_y,
+                                  sum_col_offset_first_element_in_bytes
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+                                  ,
+                                  sum_row_ptr,
+                                  sum_row_stride_x,
+                                  sum_row_step_x,
+                                  sum_row_stride_y,
+                                  sum_row_step_y,
+                                  sum_row_offset_first_element_in_bytes
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+                                  ,
+                                  biases_ptr,
+                                  biases_stride_x,
+                                  biases_step_x,
+                                  biases_offset_first_element_in_bytes
+#endif // defined(ADD_BIAS)
+                              );
+
+    __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+    VEC_INT in_s32 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr);
+
+    // Add the offset terms to GEMM's result
+    in_s32 += offset_term_s32;
+
+    // -------------- OUTPUT STAGE
+
+    // Multiply by result_mult_int and shift
+#if defined(PER_CHANNEL_QUANTIZATION)
+    __global uchar *result_multipliers_addr   = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int);
+    __global uchar *result_shifts_addr        = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int);
+    VEC_INT         result_multipliers_values = VLOAD(VEC_SIZE)(0, (__global int *)result_multipliers_addr);
+    VEC_INT         result_shifts_values      = VLOAD(VEC_SIZE)(0, (__global int *)result_shifts_addr);
+
+    VEC_INT in_s32_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, VEC_SIZE);
+    VEC_INT in_s32_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, VEC_SIZE);
+    in_s32                   = select(in_s32_shift_lt0, in_s32_shift_gt0, result_shifts_values >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+
+#if RESULT_SHIFT < 0
+    in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
+#else  // RESULT_SHIFT >= 0
+    in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
+#endif // RESULT_SHIFT < 0
+
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+
+    // Add the offset terms to GEMM's result
+    in_s32 += (VEC_INT)RESULT_OFFSET;
+
+    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
+    res0 = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
+
+#if defined(MIN_BOUND)
+    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+    // Store the result
+    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif // defined(GEMMLOWP_OFFSET_CONTRIBUTION_QUANTIZE_DOWN_FIXEDPOINT)
+
+#undef VEC_INT
+
+#endif // defined(K_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
+
+#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Add offset terms to final result
+ *  -# Multiply each entry of result by result_mult_int
+ *  -# Add bias to final result (if -DADD_BIAS is passed at compile time)
+ *  -# Shift the int32 accumulator by result_shift
+ *  -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
+ *  -# Clamp the resulting int32 values:
+ *  -#  - to the [0..255] range and cast to QASYMM8.
+ *  -#  - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ *       These values can be used to implement "rectified linear unit" activation functions
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in]  src_ptr                              Pointer to the source tensor. Supported data type: S32
+ * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[in]  biases_ptr                           (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in]  biases_stride_x                      (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                        (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
+ * @param[out] dst_ptr                              Pointer to the destination tensor Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                           dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                           dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+                                                  VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+                                                  TENSOR3D_DECLARATION(dst))
+{
+    // Compute source and destination addresses
+    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+    int y = get_global_id(1);
+    int z = get_global_id(2);
+
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+    // Add bias
+    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
+    input_values += biases_values;
+#endif // defined(ADD_BIAS)
+
+    // Add the offset terms to GEMM's result
+    input_values += (VEC_DATA_TYPE(int, VEC_SIZE))RESULT_OFFSET;
+
+    // Multiply by result_mult_int and shift
+    input_values *= RESULT_MULT_INT;
+
+#if RESULT_SHIFT < 0
+    input_values >>= -RESULT_SHIFT;
+#else  // RESULT_SHIFT >= 0
+    input_values >>= RESULT_SHIFT;
+#endif // RESULT_SHIFT < 0
+
+    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
+    res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
+
+#if defined(MIN_BOUND)
+    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+    // Store the result
+    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif // defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN)
+
+#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Round to nearest division by a power-of-two using result_shift
+ *  -# Add offset to each result
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting int32 values:
+ *      - to the [0..255] range and cast to QASYMM8.
+ *      - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ *       These values can be used to implement "rectified linear unit" activation functions
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in]  src_ptr                              Pointer to the source tensor. Supported data type: S32
+ * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[in]  biases_ptr                           (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in]  biases_stride_x                      (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                        (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
+ * @param[out] dst_ptr                              Pointer to the destination tensor Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                           dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                           dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+                                                             VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+                                                             TENSOR3D_DECLARATION(dst))
+{
+    // Compute source and destination addresses
+    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+    int y = get_global_id(1);
+    int z = get_global_id(2);
+
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+    // Add bias
+    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
+    input_values += biases_values;
+#endif // defined(ADD_BIAS)
+
+    // Multiply by result_mult_int and shift
+#if RESULT_SHIFT < 0
+    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
+#else  // RESULT_SHIFT >= 0
+    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
+#endif // RESULT_SHIFT < 0
+
+    // Add the offset terms to GEMM's result
+    input_values += (VEC_DATA_TYPE(int, VEC_SIZE))RESULT_OFFSET_AFTER_SHIFT;
+
+    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
+    res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
+
+#if defined(MIN_BOUND)
+    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+    // Store the result
+    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif // defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT)
+
+#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT_QSYMM16)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QSYMM16 value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Round to nearest division by a power-of-two using result_shift
+ *  -# Add offset to each result
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting int32 values to the [-32768..32767] range and cast to QSYMM16.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ *       These values can be used to implement "rectified linear unit" activation functions
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in]  src_ptr                              Pointer to the source tensor. Supported data type: S32
+ * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[in]  biases_ptr                           (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in]  biases_stride_x                      (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                        (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
+ * @param[out] dst_ptr                              Pointer to the destination tensor Supported data type: QSYMM16
+ * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                           dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                           dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+                                                                     VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+                                                                     TENSOR3D_DECLARATION(dst))
+{
+    // Compute source and destination addresses
+    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+    int y = get_global_id(1);
+    int z = get_global_id(2);
+
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(short) + y * dst_stride_y + z * dst_stride_z;
+
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+    // Add bias
+    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
+    input_values += biases_values;
+#endif // defined(ADD_BIAS)
+
+    // Multiply by result_mult_int and shift
+#if RESULT_SHIFT < 0
+    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
+#else  // RESULT_SHIFT >= 0
+    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
+#endif // RESULT_SHIFT < 0
+
+    VEC_DATA_TYPE(short, VEC_SIZE)
+    res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(short, VEC_SIZE));
+
+#if defined(MIN_BOUND)
+    res0 = max(res0, (VEC_DATA_TYPE(short, VEC_SIZE))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+    res0 = min(res0, (VEC_DATA_TYPE(short, VEC_SIZE))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+    // Store the result
+    STORE_VECTOR_SELECT(res, short, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif // defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FIXEDPOINT_QSYMM16)
+
+#if defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FLOAT)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Requantize
+ *  -# Add offset to each result
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting int32 values:
+ *      - to the [0..255] range and cast to QASYMM8.
+ *      - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset and scalar scale factor must be passed at compile time using -DRESULT_OFFSET, -DREAL_MULTIPLIER
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ *       These values can be used to implement "rectified linear unit" activation functions
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in]  src_ptr                              Pointer to the source tensor. Supported data type: S32
+ * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[in]  biases_ptr                           Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in]  biases_stride_x                      Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                        biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[out] dst_ptr                              Pointer to the destination tensor Supported data type: QASYMM8
+ * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                           dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                           dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                         Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                           src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+                                                        VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+#if defined(DST_HEIGHT)
+                                                        TENSOR4D_DECLARATION(dst))
+#else  // defined(DST_HEIGHT)
+                                                        TENSOR3D_DECLARATION(dst))
+#endif // defined(DST_HEIGHT)
+{
+    // Compute source and destination addresses
+    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+    int y = get_global_id(1);
+    int z = get_global_id(2);
+
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+    // Add bias
+    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
+    input_values += (VEC_DATA_TYPE(int, VEC_SIZE))biases_values;
+#endif // defined(ADD_BIAS)
+
+    // Convert to float
+    VEC_DATA_TYPE(float, VEC_SIZE)
+    input_values_f = CONVERT(input_values, VEC_DATA_TYPE(float, VEC_SIZE));
+    input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET);
+
+    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
+    res0 = CONVERT_SAT(input_values_f, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
+
+#if defined(MIN_BOUND)
+    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+    // Store the result
+    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif // defined(GEMMLOWP_OUTPUT_STAGE_QUANTIZE_DOWN_FLOAT)
diff --git a/src/core/CL/cl_kernels/common/gemmlowp_reshaped_only_rhs_mmul.cl b/src/core/CL/cl_kernels/common/gemmlowp_reshaped_only_rhs_mmul.cl
new file mode 100644
index 0000000000..72fe3d3b89
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/gemmlowp_reshaped_only_rhs_mmul.cl
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+#if defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_MMUL)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices using the MMUL extension:
+ *
+ *  The LHS matrix is NOT reshaped
+ *  The RHS is reshaped with @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel and the block K0xN0 is transposed
+ *
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=1, -DK0=1).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=1)
+ * @note The number of output columns processed by the the cooperative mmul extension must be passed at compile time using -DMMUL_N0 (e.g., -DMMUL_N0=4)
+ * @note The number of output rows processed by the the cooperative mmul extension must be passed at compile time using -DMMUL_M0 (e.g., -DMMUL_M0=4)
+ * @note The number of lhs columns (or rhs rows) processed by the the cooperative mmul extension must be passed at compile time using -DMMUL_K0 (e.g., -DMMUL_K0=16)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 4
+ *  - N0 = 1, 4, 8
+ *  - K0 = 4
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ *
+ * @param[in]  lhs_ptr                               Pointer to the LHS tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  lhs_stride_y                          Stride of the LHS tensor in Y dimension (in bytes)
+ * @param[in]  lhs_stride_z                          Stride of the LHS tensor in Z dimension (in bytes)
+ * @param[in]  lhs_w                                 The size of the width dimension of the LHS tensor
+ * @param[in]  lhs_h                                 The size of the height dimension of the LHS tensor
+ * @param[in]  lhs_n                                 The size of the depth dimension of the LHS tensor
+ * @param[in]  lhs_offset_first_element_in_bytes     The offset of the first element in the LHS tensor
+ * @param[in]  rhs_ptr                               Pointer to the RHS reshaped tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_y                          Stride of the RHS tensor in Y dimension (in bytes)
+ * @param[in]  rhs_stride_z                          Stride of the RHS tensor in Z dimension (in bytes)
+ * @param[in]  rhs_w                                 The size of the width dimension of the RHS tensor
+ * @param[in]  rhs_h                                 The size of the height dimension of the RHS tensor
+ * @param[in]  rhs_n                                 The size of the depth dimension of the RHS tensor
+ * @param[in]  rhs_offset_first_element_in_bytes     The offset of the first element in the RHS tensor
+ * @param[in]  bia_ptr                               (Optional) Pointer to the bias tensor. Supported data type: S32
+ * @param[in]  bia_stride_y                          (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in]  bia_stride_z                          (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in]  bia_w                                 (Optional) The size of the width dimension of the bias tensor
+ * @param[in]  bia_h                                 (Optional) The size of the height dimension of the bias tensor
+ * @param[in]  bia_n                                 (Optional) The size of the depth dimension of the bias tensor
+ * @param[in]  bia_offset_first_element_in_bytes     (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data type: same as @p lhs_ptr or S32
+ * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_w                                 The size of the width dimension of the destination tensor
+ * @param[in]  dst_h                                 The size of the height dimension of the destination tensor
+ * @param[in]  dst_n                                 The size of the depth dimension of the destination tensor
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  M                                     Number of rows in LHS matrix not reshaped
+ * @param[in]  N                                     Number of columns in RHS matrix not reshaped
+ * @param[in]  K                                     Number of columns in LHS matrix and rows in RHS matrix not reshaped
+ * @param[in]  sum_col_ptr                           (Optional) Pointer to the source tensor. Supported data type: S32
+ * @param[in]  sum_col_stride_x                      (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_col_step_x                        (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_col_stride_y                      (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_col_step_y                        (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ * @param[in]  sum_row_ptr                           (Optional) Pointer to the source tensor. Supported data type: S32
+ * @param[in]  sum_row_stride_x                      (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_row_step_x                        (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_row_stride_y                      (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_row_step_y                        (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor
+ */
+__kernel void gemmlowp_mm_reshaped_only_rhs_mmul(
+    TENSOR3D_T(lhs, BUFFER),
+    TENSOR3D_T(rhs, BUFFER),
+#if defined(ADD_BIAS)
+    TENSOR3D_T(bia, BUFFER),
+#endif // defined(ADD_BIAS)
+    TENSOR3D_T(dst, BUFFER),
+    const int M,
+    const int N,
+    const int K
+#if defined(A_OFFSET)
+    ,
+    TENSOR3D_T(sum_col, BUFFER)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+    ,
+    TENSOR3D_T(sum_row, BUFFER)
+#endif // defined(B_OFFSET)
+)
+{
+#define MMUL_BLOCK_SIZE (MMUL_N0 * MMUL_M0)
+#define VEC_SIZE 4 // For int8 types input to mmul instruction is a length 4 vector
+
+    uint x0 = get_global_id(0);
+    uint y0 = get_global_id(1);
+    uint z  = get_global_id(2);
+
+    // Get block ID and thread ID within the block
+    uint block_id  = (x0 / MMUL_BLOCK_SIZE);
+    uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+
+    // Coordinate within a block
+    uint block_x = thread_id % MMUL_N0;
+    uint block_y = (thread_id / MMUL_M0);
+
+    // Starting destination coordinates
+    uint dst_x = min(block_x * N0 + block_id * MMUL_N0 * N0, (uint)(N - 1));
+    uint dst_y = min(block_y * M0 + y0 * M0 * MMUL_M0, (uint)(M - M0));
+
+    uint lhs_x = VEC_SIZE * block_x;
+    uint lhs_y = dst_y;
+
+    uint rhs_x = VEC_SIZE * N0 * block_y;
+    uint rhs_y = 4 * block_id + block_x;
+
+    // Compute LHS/RHS/DST matrix address
+    lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+    rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+    dst_offset_first_element_in_bytes += dst_x * sizeof(OUT_DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+    TILE(ACC_DATA_TYPE, M0, N0, c);
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c[i].v = 0;
+    })
+
+    for(int k = 0; k <= K - MMUL_K0; k += MMUL_K0)
+    {
+        TILE(DATA_TYPE, M0, VEC_SIZE, a);
+        T_LOAD(DATA_TYPE, M0, VEC_SIZE, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+
+        TILE(DATA_TYPE, N0, VEC_SIZE, b);
+        T_LOAD(DATA_TYPE, N0, VEC_SIZE, BUFFER, rhs, 0, 0, 1, VEC_SIZE, b);
+
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            LOOP_UNROLLING(int, n0, 0, 1, N0,
+            {
+                VEC_TYPE vec_a = (VEC_TYPE)(a[m0].s[0], a[m0].s[1], a[m0].s[2], a[m0].s[3]);
+                VEC_TYPE vec_b = (VEC_TYPE)(b[n0].s[0], b[n0].s[1], b[n0].s[2], b[n0].s[3]);
+                c[m0].s[n0]    = arm_matrix_multiply(vec_a, vec_b, c[m0].s[n0]);
+            })
+        })
+
+        lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+        rhs_offset_first_element_in_bytes += MMUL_K0 * N0 * sizeof(DATA_TYPE);
+    }
+
+    if(block_x * N0 + block_id * MMUL_N0 * N0 >= N)
+    {
+        return;
+    }
+
+    if(block_y * M0 + y0 * M0 * MMUL_M0 >= M)
+    {
+        return;
+    }
+
+#if defined(FUSED_OUTPUT_STAGE_FIXED_POINT)
+
+    TILE(int, M0, N0, offset_s32);
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        offset_s32[i].v = (VEC_DATA_TYPE(int, N0))K_OFFSET;
+    })
+
+#if defined(A_OFFSET)
+
+    TILE(int, 1, N0, a_offset_s32);
+
+    T_LOAD(int, 1, N0, BUFFER, sum_col, dst_x, z, 1, sum_col_stride_z, a_offset_s32);
+
+    a_offset_s32[0].v *= A_OFFSET;
+
+    T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, a_offset_s32, offset_s32);
+#endif // defined(A_OFFSET)
+
+#if defined(B_OFFSET)
+
+    TILE(int, M0, 1, b_offset_s32);
+
+    T_LOAD(int, M0, 1, BUFFER, sum_row, dst_y, z * M, 1, 4, b_offset_s32);
+
+    LOOP_UNROLLING(int, m0, 0, 1, M0,
+    {
+        offset_s32[m0].v += b_offset_s32[m0].v *B_OFFSET;
+    })
+
+#endif // defined(B_OFFSET)
+
+#if defined(ADD_BIAS)
+#if defined(BROADCAST_BIAS)
+    bia_offset_first_element_in_bytes += dst_x * sizeof(ACC_DATA_TYPE) + z * bia_stride_y;
+
+    TILE(int, M0, N0, bias);
+
+    T_LOAD(int, M0, N0, BUFFER, bia, dst_x, dst_y, 1, 1, bias);
+
+    T_ADD(ACC_DATA_TYPE, M0, N0, offset_s32, bias, offset_s32);
+
+#else // defined(BROADCAST_BIAS)
+    bia_offset_first_element_in_bytes += dst_x * sizeof(ACC_DATA_TYPE);
+
+    TILE(int, 1, N0, bias);
+
+    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+    {
+        bias[0].v = VLOAD(N0)(0, (ACC_DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
+    }
+    else
+    {
+        VLOAD_PARTIAL(N0, N0_LEFTOVER)
+        (bias[0].v, 0, (ACC_DATA_TYPE *)(bia_ptr + bia_offset_first_element_in_bytes));
+    }
+
+    T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, offset_s32, bias, offset_s32);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(ADD_BIAS)
+
+    T_ADD(ACC_DATA_TYPE, M0, N0, c, offset_s32, c);
+    TILE(OUT_DATA_TYPE, M0, N0, c_lp);
+    T_QUANTIZE8(ACC_DATA_TYPE, OUT_DATA_TYPE, PER_TENSOR, M0, N0, RESULT_OFFSET, RESULT_SHIFT, RESULT_MULTIPLIER, c, 0, 0, c_lp);
+
+#if defined(MIN_BOUND)
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c_lp[i].v = max(c_lp[i].v, (VEC_DATA_TYPE(OUT_DATA_TYPE, N0))MIN_BOUND);
+    })
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c_lp[i].v = min(c_lp[i].v, (VEC_DATA_TYPE(OUT_DATA_TYPE, N0))MAX_BOUND);
+    })
+#endif // defined(MAX_BOUND)
+
+    T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c);
+
+    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE(N0)
+                (c_lp[m0].v, 0, (__global OUT_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+    else
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE_PARTIAL(N0, N0_LEFTOVER)
+                (c_lp[m0].v, 0, (__global OUT_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+
+#else  // FUSED_OUTPUT_STAGE_FIXED_POINT
+    // Store
+    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE(N0)
+                (c[m0].v, 0, (__global OUT_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+    else
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE_PARTIAL(N0, N0_LEFTOVER)
+                (c[m0].v, 0, (__global OUT_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+#endif // FUSED_OUTPUT_STAGE_FIXED_POINT
+}
+
+#endif // defined(GEMMLOWP_MM_RESHAPED_ONLY_RHS_MMUL)
diff --git a/src/core/CL/cl_kernels/common/gemv.cl b/src/core/CL/cl_kernels/common/gemv.cl
new file mode 100644
index 0000000000..71a372eb29
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/gemv.cl
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
+/** This kernel applies dot product to each plane on the input tensor and the corrispective column of the reshaped weight tensor.
+ *
+ * @note Datatype and source width and height should be given as a preprocessor argument using -DDATA_TYPE=type, -DSRC_WIDTH=width and -DSRC_HEIGHT=height. e.g. -DDATA_TYPE=short
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ */
+__kernel void gemm_mv(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(weights), VECTOR_DECLARATION(dst))
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+    int y = get_global_id(1) * 4;
+    int z = get_global_id(2);
+
+    __global uchar *current_weights = weights_ptr + weights_offset_first_element_in_bytes + z * weights_stride_y;
+    __global uchar *input_ptr       = src.ptr;
+
+    DATA_TYPE acc0 = (DATA_TYPE)0;
+    DATA_TYPE acc1 = (DATA_TYPE)0;
+    DATA_TYPE acc2 = (DATA_TYPE)0;
+    DATA_TYPE acc3 = (DATA_TYPE)0;
+
+    // This kernel handle 4 rows in per thread so that it can reuse the weights
+    for(int i = 0; i < SRC_WIDTH; i += 4)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        weights = vload4(0, (__global DATA_TYPE *)(current_weights + i * weights_stride_x));
+
+        int4 offset = (int4)i * (int4)src_stride_x + (int4)(0, 1, 2, 3) * (int4)src_stride_y;
+
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        tmp0 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        tmp1 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        tmp2 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        tmp3 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s3));
+
+        acc0 += dot(weights, tmp0);
+        acc1 += dot(weights, tmp1);
+        acc2 += dot(weights, tmp2);
+        acc3 += dot(weights, tmp3);
+    }
+
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y + z * SRC_HEIGHT) * dst_stride_x;
+
+    int rows_left = SRC_HEIGHT - (y + 4);
+
+    // This if check is used to handle the last few rows when it can't be divided by the four
+    if(rows_left >= 0)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        out = (VEC_DATA_TYPE(DATA_TYPE, 4))(acc0, acc1, acc2, acc3);
+        vstore4(out, 0, (__global DATA_TYPE *)output_ptr);
+    }
+    else
+    {
+        switch(rows_left)
+        {
+            case -1: // three rows left; one is padding
+                *((__global DATA_TYPE *)(output_ptr + 2 * dst_stride_x)) = acc2;
+            case -2: // two rows left; two are padding
+                *((__global DATA_TYPE *)(output_ptr + 1 * dst_stride_x)) = acc1;
+            case -3: // one row left; three are padding
+                *((__global DATA_TYPE *)(output_ptr + 0 * dst_stride_x)) = acc0;
+                break;
+        }
+    }
+}
+
+/** This kernel applies dot product to each plane on the input tensor and the corresponding column of the reshaped weight tensor.
+ *
+ * @note Input data type should be given as a preprocessor argument using -DDATA_TYPE=type, e.g. -DDATA_TYPE=uchar
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: S32
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  input_offset                          Input's quantization offset
+ * @param[in]  weights_offset                        Weights's quantization offset
+ */
+__kernel void gemm_mv_quantized(TENSOR3D_DECLARATION(src),
+                                IMAGE_DECLARATION(weights),
+                                VECTOR_DECLARATION(dst),
+                                const int input_offset,
+                                const int weights_offset)
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+    int y = get_global_id(1) * 4;
+    int z = get_global_id(2);
+
+    __global uchar *current_weights = weights_ptr + weights_offset_first_element_in_bytes + z * weights_stride_y;
+    __global uchar *input_ptr       = src.ptr;
+
+    int acc0 = 0;
+    int acc1 = 0;
+    int acc2 = 0;
+    int acc3 = 0;
+
+    // This kernel handle 4 rows in per thread so that it can reuse the weights
+    for(int i = 0; i < SRC_WIDTH; i += 4)
+    {
+        int4 w = convert_int4(vload4(0, (__global DATA_TYPE *)(current_weights + i * weights_stride_x))) + (int4)weights_offset;
+
+        int4 offset = (int4)i * (int4)src_stride_x + (int4)(0, 1, 2, 3) * (int4)src_stride_y;
+
+        int4 tmp0 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s0))) + (int4)input_offset;
+        int4 tmp1 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s1))) + (int4)input_offset;
+        int4 tmp2 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s2))) + (int4)input_offset;
+        int4 tmp3 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s3))) + (int4)input_offset;
+
+        // Accumulate
+        acc0 += tmp0.s0 * w.s0 + tmp0.s1 * w.s1 + tmp0.s2 * w.s2 + tmp0.s3 * w.s3;
+        acc1 += tmp1.s0 * w.s0 + tmp1.s1 * w.s1 + tmp1.s2 * w.s2 + tmp1.s3 * w.s3;
+        acc2 += tmp2.s0 * w.s0 + tmp2.s1 * w.s1 + tmp2.s2 * w.s2 + tmp2.s3 * w.s3;
+        acc3 += tmp3.s0 * w.s0 + tmp3.s1 * w.s1 + tmp3.s2 * w.s2 + tmp3.s3 * w.s3;
+    }
+
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y + z * SRC_HEIGHT) * dst_stride_x;
+
+    int rows_left = SRC_HEIGHT - (y + 4);
+
+    // This if check is used to handle the last few rows when it can't be divided by the four
+    if(rows_left >= 0)
+    {
+        vstore4((int4)(acc0, acc1, acc2, acc3), 0, (__global int *)output_ptr);
+    }
+    else
+    {
+        switch(rows_left)
+        {
+            case -1: // three rows left; one is padding
+                *((__global int *)(output_ptr + 2 * dst_stride_x)) = acc2;
+            case -2: // two rows left; two are padding
+                *((__global int *)(output_ptr + 1 * dst_stride_x)) = acc1;
+            case -3: // one row left; three are padding
+                *((__global int *)(output_ptr + 0 * dst_stride_x)) = acc0;
+                break;
+        }
+    }
+}
+#endif /* defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) */
diff --git a/src/core/CL/cl_kernels/common/generate_proposals.cl b/src/core/CL/cl_kernels/common/generate_proposals.cl
new file mode 100644
index 0000000000..bfe1922ac2
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/generate_proposals.cl
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Generate all the region of interests based on the image size and the anchors passed in. For each element (x,y) of the
+ * grid, it will generate NUM_ANCHORS rois, given by shifting the grid position to match the anchor.
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE= Tensor data type. Supported data types: F16/F32
+ * -# -DHEIGHT= Height of the feature map on which this kernel is applied
+ * -# -DWIDTH= Width of the feature map on which this kernel is applied
+ * -# -DNUM_ANCHORS= Number of anchors to be used to generate the rois per each pixel
+ * -# -DSTRIDE= Stride to be applied at each different pixel position (i.e., x_range = (1:WIDTH)*STRIDE and y_range = (1:HEIGHT)*STRIDE
+ * -# -DNUM_ROI_FIELDS= Number of fields used to represent a roi
+ *
+ * @param[in]  anchors_ptr                           Pointer to the anchors tensor. Supported data types: F16/F32
+ * @param[in]  anchors_stride_x                      Stride of the anchors tensor in X dimension (in bytes)
+ * @param[in]  anchors_step_x                        anchors_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  anchors_stride_y                      Stride of the anchors tensor in Y dimension (in bytes)
+ * @param[in]  anchors_step_y                        anchors_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  anchors_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  anchors_step_z                        anchors_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  anchors_offset_first_element_in_bytes The offset of the first element in the boxes tensor
+ * @param[out] rois_ptr                              Pointer to the rois. Supported data types: same as @p in_ptr
+ * @param[out] rois_stride_x                         Stride of the rois in X dimension (in bytes)
+ * @param[out] rois_step_x                           pred_boxes_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[out] rois_stride_y                         Stride of the rois in Y dimension (in bytes)
+ * @param[out] rois_step_y                           pred_boxes_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[out] rois_stride_z                         Stride of the rois in Z dimension (in bytes)
+ * @param[out] rois_step_z                           pred_boxes_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] rois_offset_first_element_in_bytes    The offset of the first element in the rois
+ */
+#if defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(NUM_ANCHORS) && defined(STRIDE) && defined(NUM_ROI_FIELDS)
+__kernel void generate_proposals_compute_all_anchors(
+    VECTOR_DECLARATION(anchors),
+    VECTOR_DECLARATION(rois))
+{
+    Vector anchors = CONVERT_TO_VECTOR_STRUCT_NO_STEP(anchors);
+    Vector rois    = CONVERT_TO_VECTOR_STRUCT(rois);
+
+    const unsigned int idx = get_global_id(0);
+    // Find the index of the anchor
+    const unsigned int anchor_idx = idx % NUM_ANCHORS;
+
+    // Find which shift is this thread using
+    const unsigned int shift_idx = idx / NUM_ANCHORS;
+
+    // Compute the shift on the X and Y direction (the shift depends exclusively by the index thread id)
+    const float shift_x = (float)(shift_idx % WIDTH) * STRIDE;
+    const float shift_y = (float)(shift_idx / WIDTH) * STRIDE;
+
+    const VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS)
+    shift = (VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS))(shift_x, shift_y, shift_x, shift_y);
+
+    // Read the given anchor
+    const VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS)
+    anchor = vload4(0, (__global DATA_TYPE *)vector_offset(&anchors, anchor_idx * NUM_ROI_FIELDS));
+
+    // Apply the shift to the anchor
+    const VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS)
+    shifted_anchor = anchor + shift;
+
+    vstore4(shifted_anchor, 0, (__global DATA_TYPE *)rois.ptr);
+}
+#endif //defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(NUM_ANCHORS) && defined(STRIDE) && defined(NUM_ROI_FIELDS)
diff --git a/src/core/CL/cl_kernels/common/generate_proposals_quantized.cl b/src/core/CL/cl_kernels/common/generate_proposals_quantized.cl
new file mode 100644
index 0000000000..70f861c4b7
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/generate_proposals_quantized.cl
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers_asymm.h"
+
+/** Generate all the region of interests based on the image size and the anchors passed in. For each element (x,y) of the
+ * grid, it will generate NUM_ANCHORS rois, given by shifting the grid position to match the anchor.
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE= Tensor data type. Supported data types: QASYMM8
+ * -# -DHEIGHT= Height of the feature map on which this kernel is applied
+ * -# -DWIDTH= Width of the feature map on which this kernel is applied
+ * -# -DNUM_ANCHORS= Number of anchors to be used to generate the rois per each pixel
+ * -# -DSTRIDE= Stride to be applied at each different pixel position (i.e., x_range = (1:WIDTH)*STRIDE and y_range = (1:HEIGHT)*STRIDE
+ * -# -DNUM_ROI_FIELDS= Number of fields used to represent a roi
+ *
+ * @param[in]  anchors_ptr                           Pointer to the anchors tensor. Supported data types: QASYMM8
+ * @param[in]  anchors_stride_x                      Stride of the anchors tensor in X dimension (in bytes)
+ * @param[in]  anchors_step_x                        anchors_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  anchors_stride_y                      Stride of the anchors tensor in Y dimension (in bytes)
+ * @param[in]  anchors_step_y                        anchors_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  anchors_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  anchors_step_z                        anchors_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  anchors_offset_first_element_in_bytes The offset of the first element in the boxes tensor
+ * @param[out] rois_ptr                              Pointer to the rois. Supported data types: same as @p in_ptr
+ * @param[out] rois_stride_x                         Stride of the rois in X dimension (in bytes)
+ * @param[out] rois_step_x                           pred_boxes_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[out] rois_stride_y                         Stride of the rois in Y dimension (in bytes)
+ * @param[out] rois_step_y                           pred_boxes_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[out] rois_stride_z                         Stride of the rois in Z dimension (in bytes)
+ * @param[out] rois_step_z                           pred_boxes_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[out] rois_offset_first_element_in_bytes    The offset of the first element in the rois
+ */
+#if defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(NUM_ANCHORS) && defined(STRIDE) && defined(NUM_ROI_FIELDS) && defined(OFFSET) && defined(SCALE)
+__kernel void generate_proposals_compute_all_anchors_quantized(
+    VECTOR_DECLARATION(anchors),
+    VECTOR_DECLARATION(rois))
+{
+    Vector anchors = CONVERT_TO_VECTOR_STRUCT_NO_STEP(anchors);
+    Vector rois    = CONVERT_TO_VECTOR_STRUCT(rois);
+
+    const size_t idx = get_global_id(0);
+    // Find the index of the anchor
+    const size_t anchor_idx = idx % NUM_ANCHORS;
+
+    // Find which shift is this thread using
+    const size_t shift_idx = idx / NUM_ANCHORS;
+
+    // Compute the shift on the X and Y direction (the shift depends exclusively by the index thread id)
+    const float shift_x = (float)(shift_idx % WIDTH) * STRIDE;
+    const float shift_y = (float)(shift_idx / WIDTH) * STRIDE;
+
+    VEC_DATA_TYPE(float, NUM_ROI_FIELDS)
+    shift = (VEC_DATA_TYPE(float, NUM_ROI_FIELDS))(shift_x, shift_y, shift_x, shift_y);
+
+    // Read the given anchor
+    VEC_DATA_TYPE(float, NUM_ROI_FIELDS)
+    anchor = DEQUANTIZE(VLOAD(NUM_ROI_FIELDS)(0, (__global DATA_TYPE *)vector_offset(&anchors, anchor_idx * NUM_ROI_FIELDS)), OFFSET, SCALE, DATA_TYPE, NUM_ROI_FIELDS);
+
+    // Apply the shift to the anchor
+    VEC_DATA_TYPE(float, NUM_ROI_FIELDS)
+    shifted_anchor = anchor + shift;
+
+    VSTORE(NUM_ROI_FIELDS)
+    (QUANTIZE(shifted_anchor, OFFSET, SCALE, DATA_TYPE, NUM_ROI_FIELDS), 0, (__global DATA_TYPE *)rois.ptr);
+}
+#endif //defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(NUM_ANCHORS) && defined(STRIDE) && defined(NUM_ROI_FIELDS) && defined(OFFSET) && defined(SCALE)
diff --git a/src/core/CL/cl_kernels/common/instance_normalization.cl b/src/core/CL/cl_kernels/common/instance_normalization.cl
new file mode 100644
index 0000000000..f9b3cd3620
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/instance_normalization.cl
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2019-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) & defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z)
+/** This function computes the mean and variance of each plane of the input tensor and provides it as output.
+ *
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g. -DDATA_TYPE=float
+ * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value, -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7
+ *
+ * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] output_ptr                           (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ */
+__kernel void compute_mean_var(
+    TENSOR4D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+
+#if defined(NHWC)
+    const int          ch             = get_global_id(0); // Current channel
+    const int          batch          = get_global_id(1); // Current batch
+    const int          elements_plane = DIM_Y * DIM_Z;
+    INTERNAL_DATA_TYPE part_sum       = 0.f;
+    INTERNAL_DATA_TYPE part_sum_sq    = 0.f;
+    const int          in_offset      = input_offset_first_element_in_bytes + batch * input_stride_w + ch * sizeof(DATA_TYPE);
+
+    for(int i_w = 0; i_w < DIM_Y; ++i_w)
+    {
+        for(int i_h = 0; i_h < DIM_Z; ++i_h)
+        {
+            INTERNAL_DATA_TYPE data = (INTERNAL_DATA_TYPE) * ((__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch));
+            part_sum += data;
+            part_sum_sq += data * data;
+        }
+    }
+
+    INTERNAL_DATA_TYPE mean                      = (part_sum / elements_plane);
+    INTERNAL_DATA_TYPE var                       = (part_sum_sq / elements_plane) - (mean * mean);
+    __global INTERNAL_DATA_TYPE *output_address0 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 0, batch);
+    *output_address0                             = mean;
+    __global INTERNAL_DATA_TYPE *output_address1 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 1, batch);
+    *output_address1                             = var;
+#else // !defined(NHWC)
+    const int ch             = get_global_id(2) % DIM_Z; // Current channel
+    const int batch          = get_global_id(2) / DIM_Z; // Current batch
+    const int elements_plane = DIM_X * DIM_Y;
+
+    VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
+    part_sum = 0.f;
+    VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
+    part_sum_sq = 0.f;
+    // Calculate partial sum
+    for(int y = 0; y < DIM_Y; ++y)
+    {
+        int x = 0;
+        for(; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
+        {
+            // Load data
+            VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
+            data = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch)), VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE));
+            part_sum += data;
+            part_sum_sq += data * data;
+        }
+        // Left-overs loop
+        for(; x < DIM_X; ++x)
+        {
+            INTERNAL_DATA_TYPE data = (INTERNAL_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch)));
+            part_sum.s0 += data;
+            part_sum_sq.s0 += data * data;
+        }
+    }
+    // Perform reduction
+#if VEC_SIZE > 8
+    part_sum.s01234567 += part_sum.s89abcdef;
+    part_sum_sq.s01234567 += part_sum_sq.s89abcdef;
+#endif // VEC_SIZE > 8
+#if VEC_SIZE > 4
+    part_sum.s0123 += part_sum.s4567;
+    part_sum_sq.s0123 += part_sum_sq.s4567;
+#endif // VEC_SIZE > 4
+#if VEC_SIZE > 2
+    part_sum.s01 += part_sum.s23;
+    part_sum_sq.s01 += part_sum_sq.s23;
+#endif // VEC_SIZE > 2
+    part_sum.s0 += part_sum.s1;
+    part_sum_sq.s0 += part_sum_sq.s1;
+
+    INTERNAL_DATA_TYPE sum    = (INTERNAL_DATA_TYPE)part_sum.s0;
+    INTERNAL_DATA_TYPE sum_sq = (INTERNAL_DATA_TYPE)part_sum_sq.s0;
+
+    const INTERNAL_DATA_TYPE mean = (sum / elements_plane);
+    const INTERNAL_DATA_TYPE var  = (sum_sq / elements_plane) - (mean * mean);
+
+    __global INTERNAL_DATA_TYPE *output_address0 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 0, batch);
+    *output_address0                             = mean;
+    __global INTERNAL_DATA_TYPE *output_address1 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 1, batch);
+    *output_address1                             = var;
+
+#endif // defined(NHWC)
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z) */
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(GAMMA) && defined(BETA) && defined(EPSILON) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z)
+/** This function normalizes the input 2D tensor across the first dimension with respect to mean and standard deviation of the same dimension.
+ *
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g. -DDATA_TYPE=float
+ * @attention The scale scalar value applied to the normalized tensor should be passed using the -DGAMMA=value compile flag, e.g. -DGAMMA=1.3
+ * @attention The offset scalar value applied to the normalized tensor should be passed using the -DBETA=value compile flag, e.g. -DBETA=2.4
+ * @attention Normalization epsilon parameter should be given as a preprocessor argument with -DEPSILON=value. e.g. -DEPSILON=0.001f
+ * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value, -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7
+ *
+ * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] output_ptr                           (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ */
+__kernel void instance_normalization(
+    TENSOR4D_DECLARATION(input),
+    TENSOR3D_DECLARATION(mean_var)
+#ifndef IN_PLACE
+    ,
+    TENSOR4D_DECLARATION(output)
+#endif /* IN_PLACE */
+)
+{
+    Tensor4D in       = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+    Tensor3D mean_var = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(mean_var);
+#ifndef IN_PLACE
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output);
+#endif /* IN_PLACE */
+
+#if defined(NHWC)
+    const int ch    = get_global_id(0); // Current channel
+    const int batch = get_global_id(2); // Current batch
+#else                                   /* defined(NHWC) */
+    const int ch    = get_global_id(2) % DIM_Z; // Current channel
+    const int batch = get_global_id(2) / DIM_Z; // Current batch
+#endif                                  /* defined(NHWC) */
+
+    const __global INTERNAL_DATA_TYPE *mean_ptr                   = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&mean_var, ch, 0, batch);
+    const __global INTERNAL_DATA_TYPE *var_ptr                    = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&mean_var, ch, 1, batch);
+    const INTERNAL_DATA_TYPE                               mean   = (INTERNAL_DATA_TYPE) * mean_ptr;
+    const INTERNAL_DATA_TYPE                               var    = (INTERNAL_DATA_TYPE) * var_ptr;
+    const INTERNAL_DATA_TYPE                               multip = GAMMA / sqrt(var + EPSILON);
+    const INTERNAL_DATA_TYPE                               beta   = (INTERNAL_DATA_TYPE)BETA;
+
+#if defined(NHWC)
+    const int in_offset = input_offset_first_element_in_bytes + batch * input_stride_w + ch * sizeof(DATA_TYPE);
+#ifndef IN_PLACE
+    const int out_offset = output_offset_first_element_in_bytes + batch * input_stride_w + ch * sizeof(DATA_TYPE);
+#endif /* IN_PLACE */
+
+    for(int i_w = 0; i_w < DIM_Y; ++i_w)
+    {
+        for(int i_h = 0; i_h < DIM_Z; ++i_h)
+        {
+            __global DATA_TYPE *input_address = (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch);
+#ifdef IN_PLACE
+            __global DATA_TYPE *output_address = input_address;
+#else  /* !IN_PLACE */
+            __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch);
+#endif /* IN_PLACE */
+            *(output_address) = (*(input_address) - mean) * multip + (INTERNAL_DATA_TYPE)BETA;
+        }
+    }
+#else // !defined(NHWC)
+    for(int y = 0; y < DIM_Y; ++y)
+    {
+        int x = 0;
+        for(; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
+        {
+            __global DATA_TYPE *input_address  = (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
+#ifdef IN_PLACE
+            __global DATA_TYPE *output_address = input_address;
+#else  /* !IN_PLACE */
+            __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
+#endif /* IN_PLACE */
+
+            VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
+            data = CONVERT(VLOAD(VEC_SIZE)(0, input_address), VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE));
+
+            VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
+            res = (data - mean) * multip + (INTERNAL_DATA_TYPE)BETA;
+            VSTORE(VEC_SIZE)
+            (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, output_address);
+        }
+        // Left-overs loop
+        for(; x < DIM_X; ++x)
+        {
+            __global DATA_TYPE *input_address  = (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch);
+#ifdef IN_PLACE
+            __global DATA_TYPE *output_address = input_address;
+#else  /* !IN_PLACE */
+            __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch);
+#endif /* IN_PLACE */
+            *(output_address)                  = (*(input_address) - mean) * multip + (INTERNAL_DATA_TYPE)BETA;
+        }
+    }
+#endif // defined(NHWC)
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(GAMMA) && defined(BETA) && defined(EPSILON) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z) */
diff --git a/src/core/CL/cl_kernels/common/l2_normalize.cl b/src/core/CL/cl_kernels/common/l2_normalize.cl
new file mode 100644
index 0000000000..fbe3406239
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/l2_normalize.cl
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X)
+/** This kernel performs l2 normalization on x-axis
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE_X=size. e.g. -DVEC_SIZE_X=16
+ * @note The leftover size in the X dimension shoud be given as preprocessor argument using -DVEC_SIZE_LEFTOVER_X is; x_dimension % VEC_SIZE_X. e.g. -DVEC_SIZE_LEFTOVER_X=1
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in]  sum_ptr                              Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  sum_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                           sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                         Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                           sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  epsilon                              Epsilon value
+ */
+__kernel void l2_normalize_x(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(sum),
+    IMAGE_DECLARATION(output),
+    DATA_TYPE epsilon)
+{
+    // Offset computation
+    const uint x_offs = max((int)(get_global_id(0) * VEC_SIZE_X - (VEC_SIZE_X - VEC_SIZE_LEFTOVER_X) % VEC_SIZE_X), 0);
+
+    // Address computation
+    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y;
+    __global uchar *sum_addr    = sum_ptr + sum_offset_first_element_in_bytes + get_global_id(1) * sum_stride_y;
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * output_stride_y;
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    in = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)input_addr);
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    normalize_value = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X))rsqrt(fmax(*((__global DATA_TYPE *)sum_addr), epsilon));
+
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    data0 = in * normalize_value;
+
+    STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE_X, VEC_SIZE_LEFTOVER_X, VEC_SIZE_LEFTOVER_X != 0 && get_global_id(0) == 0);
+}
+
+/** This kernel performs l2 normalization on y-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE_X=size. e.g. -DVEC_SIZE_X=16
+ * @note The leftover size in the X dimension shoud be given as preprocessor argument using -DVEC_SIZE_LEFTOVER_X is; x_dimension % VEC_SIZE_X. e.g. -DVEC_SIZE_LEFTOVER_X=1
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in]  sum_ptr                              Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  sum_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                           sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                         Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                           sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  epsilon                              Epsilon value
+ */
+__kernel void l2_normalize_y(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(sum),
+    IMAGE_DECLARATION(output),
+    DATA_TYPE epsilon)
+{
+    // Offset computation
+    const uint x_offs = max((int)(get_global_id(0) * VEC_SIZE_X - (VEC_SIZE_X - VEC_SIZE_LEFTOVER_X) % VEC_SIZE_X), 0);
+
+    // Address computation
+    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y;
+    __global uchar *sum_addr    = sum_ptr + sum_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE);
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * output_stride_y;
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    in = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)input_addr);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    sums = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)sum_addr);
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    normalize_value = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X))rsqrt(fmax(sums, epsilon));
+
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    data0 = in * normalize_value;
+
+    STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE_X, VEC_SIZE_LEFTOVER_X, VEC_SIZE_LEFTOVER_X != 0 && get_global_id(0) == 0);
+}
+
+/** This kernel performs l2 normalization on z-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE_X=size. e.g. -DVEC_SIZE_X=16
+ * @note The leftover size in the X dimension shoud be given as preprocessor argument using -DVEC_SIZE_LEFTOVER_X is; x_dimension % VEC_SIZE_X. e.g. -DVEC_SIZE_LEFTOVER_X=1
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in]  sum_ptr                              Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  sum_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                           sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                         Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                           sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  sum_step_z                           sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  epsilon                              Epsilon value
+ */
+__kernel void l2_normalize_z(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(sum),
+    TENSOR3D_DECLARATION(output),
+    DATA_TYPE epsilon)
+{
+    // Offset computation
+    const uint x_offs = max((int)(get_global_id(0) * VEC_SIZE_X - (VEC_SIZE_X - VEC_SIZE_LEFTOVER_X) % VEC_SIZE_X), 0);
+
+    // Address computation
+    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
+    __global uchar *sum_addr    = sum_ptr + sum_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * sum_stride_y;
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    in = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)input_addr);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    sums = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)sum_addr);
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    data0 = in * ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X))(rsqrt(fmax(sums, epsilon))));
+
+    STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE_X, VEC_SIZE_LEFTOVER_X, VEC_SIZE_LEFTOVER_X != 0 && get_global_id(0) == 0);
+}
+#endif // defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/mat_mul.cl b/src/core/CL/cl_kernels/common/mat_mul.cl
new file mode 100644
index 0000000000..c7ef8ae52b
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/mat_mul.cl
@@ -0,0 +1,708 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#ifdef BIAS
+// This function performs in-place bias addition for float/half datatype when bias is enabled.
+// Note The tile's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 (e.g. -DN0=8, -DM0=4).
+inline void perform_bias_addition(uchar *bias_ptr, uint bias_offset_first_element_in_bytes, TILE(DATA_TYPE, M0, N0, acc), uint x)
+{
+    TILE(DATA_TYPE, 1, N0, bias_tile);
+
+    // below expands to use bias_ptr and bias_offset_first_element_in_bytes
+    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, x, 0, 1, 0, bias_tile);
+
+    // c = c + bias[broadcasted]
+    T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, M0, N0, acc, bias_tile, acc);
+}
+#endif // defined(BIAS)
+
+#if defined(MAT_MUL_NATIVE_NT_NT)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions.
+ * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_NT_NT)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 > 0
+ *  - N0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
+ *  - K0 = 1, 2, 3, 4, 8, 16
+ * @note Values > 8 for M0 are not expected to be efficient
+ *
+ * @param[in]  lhs_ptr                            Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in]  lhs_stride_y                       Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  lhs_stride_z                       Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  lhs_w                              The width of the lhs tensor
+ * @param[in]  lhs_h                              The height of the lhs tensor
+ * @param[in]  lhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the lhs matrix
+ * @param[in]  rhs_img                            (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
+ * @param[in]  rhs_ptr                            Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  rhs_stride_y                       Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  rhs_w                              The width of the rhs tensor
+ * @param[in]  rhs_h                              The height of the rhs tensor
+ * @param[in]  rhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the rhs matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in]  bias_w                             (Optional) The size of the width dimension of the bias tensor
+ * @param[in]  bias_h                             (Optional) The size of the height dimension of the bias tensor
+ * @param[in]  bias_n                             (Optional) The size of the depth dimension of the bias tensor
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr                            Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  dst_stride_y                       Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  dst_w                              The width of the dst tensor
+ * @param[in]  dst_h                              The height of the dst tensor
+ * @param[in]  dst_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the dst matrix
+ */
+__kernel void mat_mul_native_nt_nt(
+    TENSOR3D_T(lhs, BUFFER),
+    TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
+#ifdef BIAS
+    TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+    TENSOR3D_T(dst, BUFFER))
+{
+    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
+    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+    const uint z = GET_SPATIAL_IDX(2, 1, 0);
+
+    // Compute LHS/RHS/DST matrix address
+    lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
+    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
+
+    // Initialize the accumulators
+    TILE(DATA_TYPE, M0, N0, acc);
+
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        acc[i].v = 0.f;
+    })
+
+    const int rhs_z = z * rhs_h;
+    int       k;
+    for(k = 0; k <= K - K0; k += K0)
+    {
+        TILE(DATA_TYPE, M0, K0, a);
+        TILE(DATA_TYPE, K0, N0, b);
+
+        LOOP_UNROLLING(int, i, 0, 1, M0,
+        {
+            a[i].v = 0.f;
+        })
+
+        LOOP_UNROLLING(int, i, 0, 1, K0,
+        {
+            b[i].v = 0.f;
+        })
+
+        // Load tile from the lhs/rhs tensors
+        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD(DATA_TYPE, K0, N0, RHS_TENSOR_TYPE, rhs, x, k + rhs_z, 1, rhs_stride_y, b);
+
+        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, a, b, acc);
+
+        lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
+    }
+
+#if K % K0 != 0
+    /* Leftover Loop */
+    for(; k < K; ++k)
+    {
+        TILE(DATA_TYPE, M0, 1, a);
+        TILE(DATA_TYPE, 1, N0, b);
+
+        LOOP_UNROLLING(int, i, 0, 1, M0,
+        {
+            a[i].v = 0.f;
+        })
+
+        LOOP_UNROLLING(int, i, 0, 1, 1,
+        {
+            b[i].v = 0.f;
+        })
+
+        // Load tile from the lhs/rhs tensors
+        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, x, k + rhs_z, 1, rhs_stride_y, b);
+
+        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, a, b, acc);
+
+        lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
+    }
+#endif // K % K0 != 0
+
+    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
+    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+
+    TILE(int, M0, 1, indirect_buffer);
+    LOOP_UNROLLING(int, _i, 0, 1, M0,
+    {
+        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
+    });
+
+#ifdef BIAS
+    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
+    T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
+}
+#endif // defined(MAT_MUL_NATIVE_NT_NT)
+
+#if defined(MAT_MUL_NATIVE_NT_T)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions.
+ * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_NT_T)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 > 0
+ *  - N0 = 1, 2, 3, 4, 8, 16
+ *  - K0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
+ * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
+ *
+ * @param[in]  lhs_ptr                            Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in]  lhs_stride_y                       Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  lhs_stride_z                       Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  lhs_w                              The width of the lhs tensor
+ * @param[in]  lhs_h                              The height of the lhs tensor
+ * @param[in]  lhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the lhs matrix
+ * @param[in]  rhs_img                            (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
+ * @param[in]  rhs_ptr                            Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  rhs_stride_y                       Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  rhs_w                              The width of the rhs tensor
+ * @param[in]  rhs_h                              The height of the rhs tensor
+ * @param[in]  rhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the rhs matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in]  bias_w                             (Optional) The size of the width dimension of the bias tensor
+ * @param[in]  bias_h                             (Optional) The size of the height dimension of the bias tensor
+ * @param[in]  bias_n                             (Optional) The size of the depth dimension of the bias tensor
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr                            Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  dst_stride_y                       Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  dst_w                              The width of the dst tensor
+ * @param[in]  dst_h                              The height of the dst tensor
+ * @param[in]  dst_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the dst matrix
+ */
+__kernel void mat_mul_native_nt_t(TENSOR3D_T(lhs, BUFFER),
+                                  TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
+#ifdef BIAS
+                                  TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+                                  TENSOR3D_T(dst, BUFFER))
+
+{
+    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
+    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+    const uint z = GET_SPATIAL_IDX(2, 1, 0);
+
+    // Compute LHS/RHS/DST matrix address
+    lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
+    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
+
+    // Initialize the accumulators
+    TILE(DATA_TYPE, M0, N0, acc);
+
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        acc[i].v = 0.f;
+    })
+
+    const int rhs_z = z * rhs_h;
+    int       k;
+    for(k = 0; k <= K - K0; k += K0)
+    {
+        TILE(DATA_TYPE, M0, K0, a);
+        TILE(DATA_TYPE, N0, K0, b);
+
+        LOOP_UNROLLING(int, i, 0, 1, M0,
+        {
+            a[i].v = 0.f;
+        })
+
+        LOOP_UNROLLING(int, i, 0, 1, N0,
+        {
+            b[i].v = 0.f;
+        })
+
+        // Load tile from the lhs/rhs tensors
+        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD(DATA_TYPE, N0, K0, RHS_TENSOR_TYPE, rhs, k, x + rhs_z, 1, rhs_stride_y, b);
+
+#if GPU_ARCH == GPU_ARCH_MIDGARD
+        // This part is written to decrease the number of loop unrollings caused
+        // by T_MMUL. The NT/NT version is partly vectorized and uses less number
+        // of loop unrollings, and code behaves as expected. Although this is not
+        // a performant solution for the specified architecture, it is necessary
+        // to overcome some limitations.
+        TILE(DATA_TYPE, K0, N0, bt);
+        LOOP_UNROLLING(int, i, 0, 1, N0,
+        {
+            LOOP_UNROLLING(int, j, 0, 1, K0,
+            {
+                bt[j].s[i] = b[i].s[j];
+            })
+        })
+        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, a, bt, acc);
+#else  // GPU_ARCH == GPU_ARCH_MIDGARD
+        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, T, a, b, acc);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+        lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
+    }
+
+#if K % K0 != 0
+    /* Leftover Loop */
+    for(; k < K; ++k)
+    {
+        TILE(DATA_TYPE, M0, 1, a);
+        TILE(DATA_TYPE, N0, 1, b);
+
+        LOOP_UNROLLING(int, i, 0, 1, M0,
+        {
+            a[i].v = 0.f;
+        })
+
+        LOOP_UNROLLING(int, i, 0, 1, N0,
+        {
+            b[i].v = 0.f;
+        })
+
+        // Load tile from the lhs/rhs tensors
+        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, k, x + rhs_z, 1, rhs_stride_y, b);
+
+#if GPU_ARCH == GPU_ARCH_MIDGARD
+        // See the main loop for the explanation of this part
+        TILE(DATA_TYPE, 1, N0, bt);
+        LOOP_UNROLLING(int, i, 0, 1, N0,
+        {
+            bt[0].s[i] = b[i].s[0];
+        })
+        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, a, bt, acc);
+#else  // GPU_ARCH == GPU_ARCH_MIDGARD
+        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, T, a, b, acc);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+        lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
+    }
+#endif // K % K0 != 0
+
+    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
+    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+
+    TILE(int, M0, 1, indirect_buffer);
+    LOOP_UNROLLING(int, _i, 0, 1, M0,
+    {
+        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
+    });
+
+#ifdef BIAS
+    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
+    T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
+}
+#endif // defined(MAT_MUL_NATIVE_NT_T)
+
+#if defined(MAT_MUL_NATIVE_T_NT)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS non-transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions.
+ * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_T_NT)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 8, 16
+ *  - N0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
+ *  - K0 > 0
+ * * @note Values > 8 for M0, and K0 are not expected to be efficient
+ *
+ * @param[in]  lhs_ptr                            Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in]  lhs_stride_y                       Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  lhs_stride_z                       Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  lhs_w                              The width of the lhs tensor
+ * @param[in]  lhs_h                              The height of the lhs tensor
+ * @param[in]  lhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the lhs matrix
+ * @param[in]  rhs_img                            (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
+ * @param[in]  rhs_ptr                            Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  rhs_stride_y                       Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  rhs_w                              The width of the rhs tensor
+ * @param[in]  rhs_h                              The height of the rhs tensor
+ * @param[in]  rhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the rhs matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in]  bias_w                             (Optional) The size of the width dimension of the bias tensor
+ * @param[in]  bias_h                             (Optional) The size of the height dimension of the bias tensor
+ * @param[in]  bias_n                             (Optional) The size of the depth dimension of the bias tensor
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr                            Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  dst_stride_y                       Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  dst_w                              The width of the dst tensor
+ * @param[in]  dst_h                              The height of the dst tensor
+ * @param[in]  dst_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the dst matrix
+ */
+__kernel void mat_mul_native_t_nt(
+    TENSOR3D_T(lhs, BUFFER),
+    TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
+#ifdef BIAS
+    TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+    TENSOR3D_T(dst, BUFFER))
+{
+    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
+    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+    const uint z = GET_SPATIAL_IDX(2, 1, 0);
+
+    // Compute LHS/RHS/DST matrix address
+    lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
+    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
+
+    // Initialize the accumulators
+    TILE(DATA_TYPE, M0, N0, acc);
+
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        acc[i].v = 0.f;
+    })
+
+    const int rhs_z = z * rhs_h;
+    int       k;
+    for(k = 0; k <= K - K0; k += K0)
+    {
+        TILE(DATA_TYPE, K0, M0, a);
+        TILE(DATA_TYPE, K0, N0, b);
+
+        LOOP_UNROLLING(int, i, 0, 1, K0,
+        {
+            a[i].v = 0.f;
+        })
+
+        LOOP_UNROLLING(int, i, 0, 1, K0,
+        {
+            b[i].v = 0.f;
+        })
+
+        // Load tile from the lhs/rhs tensors
+        T_LOAD(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD(DATA_TYPE, K0, N0, RHS_TENSOR_TYPE, rhs, x, k + rhs_z, 1, rhs_stride_y, b);
+
+#if GPU_ARCH == GPU_ARCH_MIDGARD
+        // For explanation, see mat_mul_native_nt_t
+        TILE(DATA_TYPE, M0, K0, at);
+        LOOP_UNROLLING(int, i, 0, 1, K0,
+        {
+            LOOP_UNROLLING(int, j, 0, 1, M0,
+            {
+                at[j].s[i] = a[i].s[j];
+            })
+        })
+        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, at, b, acc);
+#else  // GPU_ARCH == GPU_ARCH_MIDGARD
+        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, T, NT, a, b, acc);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+        lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
+    }
+
+#if K % K0 != 0
+    /* Leftover Loop */
+    for(; k < K; ++k)
+    {
+        TILE(DATA_TYPE, 1, M0, a);
+        TILE(DATA_TYPE, 1, N0, b);
+
+        LOOP_UNROLLING(int, i, 0, 1, 1,
+        {
+            a[i].v = 0.f;
+        })
+
+        LOOP_UNROLLING(int, i, 0, 1, 1,
+        {
+            b[i].v = 0.f;
+        })
+
+        // Load tile from the lhs/rhs tensors
+        T_LOAD(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, x, k + rhs_z, 1, rhs_stride_y, b);
+
+#if GPU_ARCH == GPU_ARCH_MIDGARD
+        // For explanation, see mat_mul_native_nt_t
+        TILE(DATA_TYPE, M0, 1, at);
+        LOOP_UNROLLING(int, j, 0, 1, M0,
+        {
+            at[j].s[0] = a[0].s[j];
+        })
+        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, at, b, acc);
+#else  // GPU_ARCH == GPU_ARCH_MIDGARD
+        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, T, NT, a, b, acc);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+        lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
+    }
+#endif // K % K0 != 0
+
+    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
+    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+
+    TILE(int, M0, 1, indirect_buffer);
+    LOOP_UNROLLING(int, _i, 0, 1, M0,
+    {
+        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
+    });
+
+#ifdef BIAS
+    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
+    T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
+}
+#endif // defined(MAT_MUL_NATIVE_T_NT)
+
+#if defined(MAT_MUL_NATIVE_T_T)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions.
+ * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the rhs tensor must be passed at compile time using -DRHS_TENSOR_TYPE (e.g. -DRHS_TENSOR_TYPE=BUFFER)
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_T_NT)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 8, 16
+ *  - N0 = 1, 2, 3, 4, 8, 16
+ *  - K0 = 1, 2, 3, 4, 8, 16 (only 4, 8, 16 if RHS_TENSOR_TYPE=IMAGE)
+ * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
+ *
+ * @param[in]  lhs_ptr                            Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in]  lhs_stride_y                       Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  lhs_stride_z                       Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  lhs_w                              The width of the lhs tensor
+ * @param[in]  lhs_h                              The height of the lhs tensor
+ * @param[in]  lhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the lhs matrix
+ * @param[in]  rhs_img                            (Optional) Read only cl_image object for the rhs tensor. Included when RHS_TENSOR_TYPE=IMAGE
+ * @param[in]  rhs_ptr                            Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  rhs_stride_y                       Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  rhs_w                              The width of the rhs tensor
+ * @param[in]  rhs_h                              The height of the rhs tensor
+ * @param[in]  rhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the rhs matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in]  bias_w                             (Optional) The size of the width dimension of the bias tensor
+ * @param[in]  bias_h                             (Optional) The size of the height dimension of the bias tensor
+ * @param[in]  bias_n                             (Optional) The size of the depth dimension of the bias tensor
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr                            Pointer to the dst matrix. Supported data types: same as @p lhs_ptr,
+ * @param[in]  dst_stride_y                       Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  dst_w                              The width of the dst tensor
+ * @param[in]  dst_h                              The height of the dst tensor
+ * @param[in]  dst_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the dst matrix
+ */
+__kernel void mat_mul_native_t_t(
+    TENSOR3D_T(lhs, BUFFER),
+    TENSOR3D_T(rhs, RHS_TENSOR_TYPE),
+#ifdef BIAS
+    TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+    TENSOR3D_T(dst, BUFFER))
+{
+    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
+    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+    const uint z = GET_SPATIAL_IDX(2, 1, 0);
+
+    // Compute LHS/RHS/DST matrix address
+    lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
+    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
+
+    // Initialize the accumulators
+    TILE(DATA_TYPE, M0, N0, acc);
+
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        acc[i].v = 0.f;
+    })
+
+    const int rhs_z = z * rhs_h;
+    int       k;
+    for(k = 0; k <= K - K0; k += K0)
+    {
+        TILE(DATA_TYPE, K0, M0, a);
+        TILE(DATA_TYPE, N0, K0, b);
+
+        LOOP_UNROLLING(int, i, 0, 1, K0,
+        {
+            a[i].v = 0.f;
+        })
+
+        LOOP_UNROLLING(int, i, 0, 1, N0,
+        {
+            b[i].v = 0.f;
+        })
+
+        // Load tile from the lhs/rhs tensors
+        T_LOAD(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD(DATA_TYPE, N0, K0, RHS_TENSOR_TYPE, rhs, k, x + rhs_z, 1, rhs_stride_y, b);
+#if GPU_ARCH == GPU_ARCH_MIDGARD
+        // For explanation, see mat_mul_native_nt_t
+        TILE(DATA_TYPE, M0, K0, at);
+        TILE(DATA_TYPE, K0, N0, bt);
+
+        LOOP_UNROLLING(int, i, 0, 1, K0,
+        {
+            LOOP_UNROLLING(int, j, 0, 1, M0,
+            {
+                at[j].s[i] = a[i].s[j];
+            })
+        })
+
+        LOOP_UNROLLING(int, i, 0, 1, N0,
+        {
+            LOOP_UNROLLING(int, j, 0, 1, K0,
+            {
+                bt[j].s[i] = b[i].s[j];
+            })
+        })
+
+        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, NT, NT, at, bt, acc);
+#else  // GPU_ARCH == GPU_ARCH_MIDGARD
+        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, K0, T, T, a, b, acc);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+        lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
+    }
+
+#if K % K0 != 0
+    /* Leftover Loop */
+    for(; k < K; ++k)
+    {
+        TILE(DATA_TYPE, 1, M0, a);
+        TILE(DATA_TYPE, N0, 1, b);
+
+        LOOP_UNROLLING(int, i, 0, 1, 1,
+        {
+            a[i].v = 0.f;
+        })
+
+        LOOP_UNROLLING(int, i, 0, 1, N0,
+        {
+            b[i].v = 0.f;
+        })
+
+        // Load tile from the lhs/rhs tensors
+        T_LOAD(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, k, x + rhs_z, 1, rhs_stride_y, b);
+
+#if GPU_ARCH == GPU_ARCH_MIDGARD
+        // For explanation, see mat_mul_native_nt_t
+        TILE(DATA_TYPE, M0, 1, at);
+        TILE(DATA_TYPE, 1, N0, bt);
+
+        LOOP_UNROLLING(int, j, 0, 1, M0,
+        {
+            at[j].s[0] = a[0].s[j];
+        })
+
+        LOOP_UNROLLING(int, i, 0, 1, N0,
+        {
+            bt[0].s[i] = b[i].s[0];
+        })
+
+        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, NT, NT, at, bt, acc);
+#else  // GPU_ARCH == GPU_ARCH_MIDGARD
+        T_MMUL(DATA_TYPE, DATA_TYPE, DATA_TYPE, M0, N0, 1, T, T, a, b, acc);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+        lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
+    }
+#endif // K % K0 != 0
+
+    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
+    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+
+    TILE(int, M0, 1, indirect_buffer);
+    LOOP_UNROLLING(int, _i, 0, 1, M0,
+    {
+        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
+    });
+
+#ifdef BIAS
+    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
+    T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, acc, acc);
+
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, acc, indirect_buffer);
+}
+#endif // defined(MAT_MUL_NATIVE_T_T)
diff --git a/src/core/CL/cl_kernels/common/mat_mul_mmul.cl b/src/core/CL/cl_kernels/common/mat_mul_mmul.cl
new file mode 100644
index 0000000000..e549da86d4
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/mat_mul_mmul.cl
@@ -0,0 +1,946 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#ifdef BIAS
+// This function performs in-place bias addition for float and half datatypes when bias is enabled.
+// Note The tile's dimensions used for the LHS and RHS matrices (M0, N0) must be passed at compile time using -DN0, -DM0 (e.g. -DN0=8, -DM0=4).
+inline void perform_bias_addition(uchar *bias_ptr, uint bias_offset_first_element_in_bytes, TILE(DATA_TYPE, M0, N0, acc), uint x)
+{
+    TILE(DATA_TYPE, 1, N0, bias_tile);
+
+    // below expands to use bias_ptr and bias_offset_first_element_in_bytes
+    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, x, 0, 1, 0, bias_tile);
+
+    // c = c + bias[broadcasted]
+    T_ELTWISE_BROADCAST_ADD_X(DATA_TYPE, M0, N0, acc, bias_tile, acc);
+}
+#endif // defined(BIAS)
+
+#if defined(MAT_MUL_NATIVE_MMUL_NT_NT)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul) using MMUL: LHS non-transposed, RHS non-transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The tile's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=1).
+ * @note The number of leftover outputs rows/columns must be passed using -DN0_LEFTOVER and -DM0_LEFTOVER (e.g. -DN0_LEFTOVER=2, -DM0_LEFTOVER=3)
+ * @note The MMUL block dimension (MMUL_M0, MMUL_N0, MMUL_K0) must be passed at compile time using -DMMUL_M0, -DMMUL_N0 and -DMMUL_K0 (e.g. -DMMUL_M0=4, -DMMUL_N0=4, -DMMUL_K0=4).
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_MMUL_NT_NT)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 > 0
+ *  - N0 = 1, 2, 3, 4, 8, 16
+ *  - K0 = 1
+ * @note Values > 8 for M0 are not expected to be efficient
+ *
+ * @param[in]  lhs_ptr                            Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in]  lhs_stride_y                       Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  lhs_stride_z                       Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  lhs_w                              The width of the lhs tensor
+ * @param[in]  lhs_h                              The height of the lhs tensor
+ * @param[in]  lhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the lhs matrix
+ * @param[in]  rhs_ptr                            Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  rhs_stride_y                       Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  rhs_w                              The width of the rhs tensor
+ * @param[in]  rhs_h                              The height of the rhs tensor
+ * @param[in]  rhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the rhs matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in]  bias_w                             (Optional) The size of the width dimension of the bias tensor
+ * @param[in]  bias_h                             (Optional) The size of the height dimension of the bias tensor
+ * @param[in]  bias_n                             (Optional) The size of the depth dimension of the bias tensor
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr                            Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  dst_stride_y                       Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  dst_w                              The width of the dst tensor
+ * @param[in]  dst_h                              The height of the dst tensor
+ * @param[in]  dst_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the dst matrix
+ * @param[in]  M                                  Number of rows in LHS matrix
+ * @param[in]  N                                  Number of columns in RHS matrix
+ * @param[in]  K                                  Number of columns in LHS matrix and rows in RHS matrix, which is multiple of MMUL_K0.
+ */
+__kernel void mat_mul_native_mmul_nt_nt(
+    TENSOR3D_T(lhs, BUFFER),
+    TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+    TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+    TENSOR3D_T(dst, BUFFER),
+    const int M,
+    const int N,
+    const int K)
+{
+#define MMUL_BLOCK_SIZE (MMUL_M0 * MMUL_N0) // MMUL block size for the output matrix
+
+    // The output/destination matrix is divided into "sections". Each section is filled by a group of
+    // threads of size MMUL_BLOCK_SIZE, bundled together according to GWS_x.
+    // Each thread writes to a tile of M0 x N0 (the usual output block size for a thread) in the output matrix.
+    // Therefore, the section dimensions are (MMUL_M0 x M0) x (MMUL_N0 x N0).
+
+    // The GWS is constructed in such a way that the y global id is the y section coordinate,
+    // and the x global id is a transformed thread id: MMUL_BLOCK_SIZE number of consecutive threads
+    // in the x dimension corresponding to a section.
+    // This can be visualized as first obtaining the coordinates of all the sections:
+    // x = [0, (N / N0) / MMUL_N0) --> (N / N0) / MMUL_N0 is the number of sections in x dimension
+    // y = [0, (M / M0) / MMUL_M0) --> (M / M0) / MMUL_M0 is the number of sections in y dimension
+    // Then multiply the x coordinates with MMUL_SECTION_NUM_THREADS to get the consecutive thread ids in the x dimension
+    // x = [0, ((N / N0) / MMUL_N0) * MMUL_N0 * MMUL_M0)
+    // x = [0, (N / N0) * MMUL_MO)
+    const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
+    // The upper limit is a simplified version of (N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE)
+    const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
+    const uint z  = get_global_id(2); // Batch
+
+    // Get section coordinates
+    const uint section_x = (x0 / MMUL_BLOCK_SIZE);
+    const uint section_y = y0;
+
+    // Within these sections, each thread writes onto a small output block of size M0 x N0
+    // in row major order. A section divided into tiles can be visualized as below.
+    //
+    //                   (Figure 1)
+    //          A Section in the Output Matrix
+    //
+    //    _____N0__________N0____________________N0____
+    //    |           |          |         |           |
+    //    |           |          |         |           |
+    // M0 |  Thread 1 | Thread 2 |   ...   |  Thread   |
+    //    |           |          |         |  MMUL_N0  |
+    //    |___________|__________|_________|___________|
+    //    |           |                    |           |
+    //    |           |                    |           |
+    // M0 |  Thread   |     .              |           |
+    //    | MMUL_N0+1 |       .            |           | (M0 x MMUL_M0)
+    //    |___________|         .          |           |
+    //    |     .                          |           |
+    //    |     .                          |           |
+    //    |     .                          |           |
+    //    |                                |___________|
+    //    |                                |           |
+    //    |                                |  Thread   |
+    // M0 |                                | MMUL_N0 x |
+    //    |                                | MMUL_M0   |
+    //    |________________________________|___________|
+    //                  N0 x MMUL_N0
+    //
+    // The output matrix has several of these sections. As shown above, each section
+    // will be filled by a separate thread group of size MMUL_BLOCK_SIZE. The overall
+    // section layout of the output matrix is as below. For instance, S(1,1) will be filled
+    // by MMUL_BLOCK_SIZE (possibly equal to 16) threads, so as S(0,1) and others.
+    //
+    //                          (Figure 2)
+    //                          DST Matrix
+    //              ____________________________________
+    //             |        |        |       |         |
+    //             | S(0,0) | S(0,1) | ...   | S(0, X) |
+    //             |________|________|_______|_________|
+    //             |        |        |       |         |
+    //             | S(1,0) | S(1,1) | ...   | S(1, X) |
+    //             |________|________|_______|_________|
+    //             |   .    |        |                 |
+    //             |   .    |        |                 |        Y = (M / M0) / MMUL_M0 - 1 (Max possible section y coordinate)
+    //             |   .    |        |                 |        X = (N / N0) / MMUL_N0 - 1 (Max possible section x coordinate)
+    //             |________|________|_________________|
+    //             |        |        |       |         |        S(y, x) denotes the section, and y and x are computed in
+    //             | S(Y,0) | S(Y,1) |       | S(Y, X) |        section_y, section_x respectively.
+    //             |________|________|_______|_________|
+    //
+    //
+    //
+    //
+    // A complete view involving the three matrices is given below. It examplifies how the section S(0,0) is computed.
+    //
+    //                                                    (Figure 3)
+    //                                                  Complete View
+    //
+    //                       LHS Matrix                             RHS Matrix                                          DST Matrix
+    //
+    //                   ___MMUL_K0___________               __MMUL_N0 x N0____________                     ___MMUL_N0 x N0____________________
+    //                  /|xxxxxxxxxx|         |             /|xxxxxxxxxxxxxxx|         |                   /|xxxxxxxxxxxxxxxxxxx|             |
+    //                 / |xxxxxxxxxx|         |    MMUK_K0  ||xxxxxxxxxxxxxxx|         |                  / |xxxxxxxxxxxxxxxxxxx|             |
+    //      MMUL_M0    | |xxxxxxxxxx|  --->   |             ||xxxxxxxxxxxxxxx| . . .   |        MMUL_M0  |  |xxxxxxxxxxxxxxxxxxx|             |
+    //        x M0     | |xxxxxxxxxx|         |             \|_______________|_________|          x M0   |  |xxxxxxxxxxxxxxxxxxx|     ...     |
+    //                 | |xxxxxxxxxx|         |              |                         |                 |  |xxxxxxxxxxxxxxxxxxx|             |
+    //                 | |xxxxxxxxxx|         |     x        |       |                 |   =              \ |xxxxxxxxxxxxxxxxxxx|             |
+    //                  \|__________|_________|              |       |                 |                   \|___________________|             |
+    //                   |                    |              |       \/                |                    |                                 |
+    //                   |   ,                |              |_________________________|                    |         .                       |
+    //                   |   ,                |                                                             |         .                       |
+    //                   |   ,                |                                                             |         .                       |
+    //                   |____________________|                                                             |_________________________________|
+    //
+    // Horizontal and vertical arrows show the direction of K loop (main loop in the kernel).
+    // Each output section shown above is a zooomed out version of Figure 1.
+    //
+    // In each iteration of the main loop, LHS matrix traverses towards rightward, and RHS matrix traverses towards downward,
+    // the LHS section of (MMUL_M0 x M0) x MMUL_K0 and RHS section of MMUL_K0 x (MMUL_N0 x N0) is multiplied
+    // "cooperatively" using arm_matrix_multiply calls, and the result is accummulated over the output (DST) section
+    // of size (MMUL_M0 x M0) x (MMUL_N0 x N0) shown with 'x' signs.
+    //
+    // If it was a single thread, this multiplication would have been straightforward with a T_MMUL call.
+    // However, since it involves multiple threads working together using the aforementioned extension, it
+    // works slightly differently.
+    //
+    // Here is how threads access the LHS and RHS matrices:
+    // (Assume MMUL_K0 = MMUL_N0 = MMUL_M0 = 4 because the following diagram is heavily dependent on this)
+    //
+    //                                              (Figure 4)
+    //                               Thread Access Layouts in LHS & RHS matrices
+    //
+    //                   LHS matrix                                                             RHS Matrix
+    //           ___________________________                     __________N0 times______N0 times____________________N0 times_______
+    //          |__T0__|__T1__|__T2__|__T3__|                   |__T0__| ... |__T0__|__T1__| ...  |__T1__| ... |__T3__| ... |__T3__|
+    //          |__T0__| ...                |                   |__T4__| ... |__T4__|__T5__| ...  |__T5__| ... |__T7__| ... |__T7__|
+    //    M0    |   .    .                  |                   |__T8__| ... |__T8__|__T9__| ...  |__T9__| ... |__T11_| ... |__T11_|
+    //   Times  |   .       .               |                   |__T12_|_____|__T12_|__T13_|______|__T13_|_____|__T15_|_____|__T15_|
+    //          |   .           .           |           X
+    //          |__T0__|__T1__|__T2__|__T3__|
+    //          |__T4__|__T5__|__T6__|__T7__|
+    //          |__T4__|__T5__|__T6__|__T7__|
+    //    M0    |   .    .                  |
+    //   Times  |   .       .               |
+    //          |   .           .           |
+    //          |__T4__|__T5__|__T6__|__T7__|
+    //          |__T8__|__T9__|__T10_|__T11_|
+    //    M0    |   .                       |
+    //   Times  |   .                       |
+    //          |   .                       |
+    //          |__T12_|__T13_|__T14_|__T15_|
+    //    M0    |   .                       |
+    //   Times  |   .                       |
+    //          |   .                       |
+    //          |__T12_|__T13_|__T14_|__T15_|
+    //
+    //
+    // This access layout is designed such that the threads access continuous elements of each matrix (in terms of row/column).
+    // To multiply these large sections, the arm_matrix_multiply call is made for each of the M0xN0 elements. So, for each
+    // combination of m0 and n0 (iterators of M0 and N0 from 0 to M0-1 and N0-1 respectively), one arm_matrix_multiply call is
+    // made, and MMUL_BLOCK_SIZE number of threads compute the result.
+    //
+    // The matrix multiplication taking place in this extension
+    // is an "interleaved" one, because, for example, if m0=0 and n0=0, i.e. the first iteration, we would use the first,
+    // M0-th, 2M0-th and 3M0-th rows of the LHS matrix. Similarly, we jump N0 steps in the RHS matrix. This is how we access
+    // one element for each thread in a single (m0, n0) loop.
+    //
+    //   For example, if we have
+    //          - a 8 x 4 LHS section
+    //          - 4 x 8 RHS section
+    //          - Each vector variable ai, bj represent a 4x1 vector
+    //          - ^T (superscript T) denotes transpose
+    //          - M0 = N0 = 2
+    //          - MMUL_N0 = MMUL_M0 = MMUL_K0 = 4
+    //
+    //                                             (Figure 5)
+    //                              Mathematical view of the Matrix Multiplication
+    //
+    //      LHS                           RHS                                           DST
+    //    [  a1^T  ]            [ b1 b2 b3 b4 b5 b6 b7 ]                [ a1^Tb1  a1^Tb2  a1^Tb3 ... a1^Tb7 ]
+    //    [  a2^T  ]                                    4 x 8           [ a2^Tb1  a2^Tb2  a2^Tb3 ... a2^Tb7 ]
+    //    [  a3^T  ]                                                    [                                   ]
+    //    [  a4^T  ]                                                =   [   .       .                       ]
+    //    [  a5^T  ]        X                                           [   .          .                    ]
+    //    [  a6^T  ]                                                    [   .             .                 ]
+    //    [  a7^T  ]                                                    [                                   ]
+    //    [  a8^T  ]                                                    [ a7^Tb1  a7^Tb2  a7^Tb3 ... a7^Tb7 ]
+    //              8 x 4                                                                                     8 x 8
+    //
+    //
+    //  For the first iteration, i.e. (m0, n0) = (0, 0), the arm_matrix_multiply would multiply the following matrices:
+    //
+    //    [  a1^T  ]            [  b1 b3 b5 b7 ]                [ a1^Tb1  a1^Tb3  a1^Tb5  a1^Tb7 ]
+    //    [  a3^T  ]        x                   4 x 4     =     [ a3^Tb1  a1^Tb3  a1^Tb5  a1^Tb7 ]
+    //    [  a5^T  ]                                            [ a5^Tb1  a1^Tb3  a1^Tb5  a1^Tb7 ]
+    //    [  a7^T  ]                                            [ a7^Tb1  a7^Tb3  a7^Tb5  a7^Tb7 ]
+    //              4 x 4                                                                         4 x 4
+    //  The elements calculated in the 4x4 output block are the "interleaved" elements in the DST above.
+    //  When we follow for each combination of (m0, n0), every element of the DST matrix "section" is filled.
+    //
+
+    // Get thread coordinates within an mmul block (of size MMUL_BLOCK_SIZE)
+    // Since threads are grouped in x dimension, the modular of x-dim global id
+    // wrt the MMUL_BLOCK_SIZE is the thread id in the group, ranging from 0 to
+    // MMUL_BLOCK_SIZE-1. Because the thread numbering is in row-major order.
+    const uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+    const uint thread_x  = thread_id % MMUL_N0;
+    const uint thread_y  = (thread_id / MMUL_N0);
+
+    // Starting destination coordinates
+    // Note: We need to clamp dst_x and dst_y because we always need to execute a complete MMUL block! Only after the matrix multiplication
+    // part can we exit the kernel if it is out-of-bound. Remember, we have a cooperative matrix multiplication. Therefore, we need a full block to get the correct results
+    // Although we will never write out-of-bound, we still need this clamp to ensure that we do not read out-of-bound either.
+    // The unclamped dst coordinates can be calculated easily from the output section coordinates and the thread coordinates (see above figure).
+
+    // See Figure 1 & 2. Thread step size is N0 and M0,
+    //                   Section step size is N0 x MMUL_N0 and M0 x MMUL_M0
+    //                   respectively for x, y dimensions.
+    const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;
+    const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;
+    const uint dst_x           = min(dst_x_unclamped, (uint)(N - N0));
+    const uint dst_y           = min(dst_y_unclamped, (uint)(M - M0));
+
+    // Starting LHS coordinates
+    const uint lhs_x = thread_x;
+    const uint lhs_y = dst_y;
+
+    // Starting RHS coordinates
+    const uint rhs_x = dst_x;
+    const uint rhs_y = thread_y;
+
+    // Compute LHS/RHS/DST matrix address
+    lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+    rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+    dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+    // Initialize the accumulators
+    // MMUL extension accumulate the result in F32 for both F32 and F16
+    TILE(float, M0, N0, c_f32);
+
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c_f32[i].v = 0;
+    })
+
+    for(int k = 0; k < K; k += MMUL_K0)
+    {
+        // A tile of M0xK0 but K0 must be set to 1
+        TILE(DATA_TYPE, M0, 1, a);
+        // A tile of K0xN0 but K0 must be set to 1
+        TILE(DATA_TYPE, 1, N0, b);
+
+        // Load tile from the lhs/rhs tensors
+        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            LOOP_UNROLLING(int, n0, 0, 1, N0,
+            {
+                c_f32[m0].s[n0] = arm_matrix_multiply(a[m0].s[0], b[0].s[n0], c_f32[m0].s[n0]);
+            })
+        })
+
+        lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+        rhs_offset_first_element_in_bytes += MMUL_K0 * rhs_stride_y;
+    }
+
+    // For threads "outside" of the dst bound, we do not write but we have to "read" (arm_matrix_multiply). That's why this needs to happen after arm_matrix_multiply
+    if(dst_x_unclamped >= N || dst_y_unclamped >= M)
+    {
+        return;
+    }
+
+#if defined(HALF_PRECISION)
+    TILE(DATA_TYPE, M0, N0, c);
+
+    // Conversion required for the half precision
+    LOOP_UNROLLING(int, m0, 0, 1, M0,
+    {
+        LOOP_UNROLLING(int, n0, 0, 1, N0,
+        {
+            c[m0].s[n0] = c_f32[m0].s[n0];
+        })
+    })
+#else // defined(HALF_PRECISION)
+#define c c_f32
+#endif // defined(HALF_PRECISION)
+
+#ifdef BIAS
+    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);
+#endif // defined(BIAS)
+
+    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE(N0)
+                (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+    else
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE_PARTIAL(N0, N0_LEFTOVER)
+                (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+
+#undef MMUL_BLOCK_SIZE
+}
+#endif // defined(MAT_MUL_NATIVE_MMUL_NT_NT)
+
+#if defined(MAT_MUL_NATIVE_MMUL_T_NT)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul) using MMUL: LHS transposed, RHS non-transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The tile's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=1).
+ * @note The number of leftover outputs rows/columns must be passed using -DN0_LEFTOVER and -DM0_LEFTOVER (e.g. -DN0_LEFTOVER=2, -DM0_LEFTOVER=3)
+ * @note The MMUL block dimension (MMUL_M0, MMUL_N0, MMUL_K0) must be passed at compile time using -DMMUL_M0, -DMMUL_N0 and -DMMUL_K0 (e.g. -DMMUL_M0=4, -DMMUL_N0=4, -DMMUL_K0=4).
+ * @note The dimension K must be passed at compile time using -DK (e.g. -DK=4). K must be a multiple of MMUL_K0
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_MMUL_T_NT)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 8, 16
+ *  - N0 = 1, 2, 3, 4, 8, 16
+ *  - K0 = 1
+ * @note Values > 8 for M0 are not expected to be efficient
+ *
+ * @param[in]  lhs_ptr                            Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in]  lhs_stride_y                       Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  lhs_stride_z                       Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  lhs_w                              The width of the lhs tensor
+ * @param[in]  lhs_h                              The height of the lhs tensor
+ * @param[in]  lhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the lhs matrix
+ * @param[in]  rhs_ptr                            Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  rhs_stride_y                       Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  rhs_w                              The width of the rhs tensor
+ * @param[in]  rhs_h                              The height of the rhs tensor
+ * @param[in]  rhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the rhs matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in]  bias_w                             (Optional) The size of the width dimension of the bias tensor
+ * @param[in]  bias_h                             (Optional) The size of the height dimension of the bias tensor
+ * @param[in]  bias_n                             (Optional) The size of the depth dimension of the bias tensor
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr                            Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  dst_stride_y                       Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  dst_w                              The width of the dst tensor
+ * @param[in]  dst_h                              The height of the dst tensor
+ * @param[in]  dst_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the dst matrix
+ * @param[in]  M                                  Number of rows in DST matrix
+ * @param[in]  N                                  Number of columns in DST matrix
+ * @param[in]  K                                  Number of rows in LHS and RHS matrices, which is multiple of MMUL_K0.
+ */
+__kernel void mat_mul_native_mmul_t_nt(
+    TENSOR3D_T(lhs, BUFFER),
+    TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+    TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+    TENSOR3D_T(dst, BUFFER),
+    const int M,
+    const int N,
+    const int K)
+{
+#define MMUL_BLOCK_SIZE (MMUL_M0 * MMUL_N0)
+    // For explanations on how this kernel works, please refer to NT/NT kernel. This kernel makes little modifications to it.
+
+    const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
+    // The upper limit is a simplified version of (N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE)
+    const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
+    const uint z  = get_global_id(2); // Batch
+
+    // Get section coordinates
+    const uint section_x = (x0 / MMUL_BLOCK_SIZE);
+    const uint section_y = y0;
+
+    // Get thread coordinates
+    uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+    uint thread_x  = thread_id % MMUL_N0;
+    uint thread_y  = (thread_id / MMUL_N0);
+
+    // See Nt/Nt kernel for explanations
+    const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;
+    const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;
+    const uint dst_x           = min(dst_x_unclamped, (uint)(N - N0));
+    const uint dst_y           = min(dst_y_unclamped, (uint)(M - M0));
+
+    // Starting LHS coordinates
+    uint lhs_x = dst_y;
+    uint lhs_y = thread_x;
+
+    // Starting RHS coordinates
+    uint rhs_x = dst_x;
+    uint rhs_y = thread_y;
+
+    // Compute LHS/RHS/DST matrix address
+    lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+    rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+    dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+    // Initialize the accumulators
+    // MMUL extension accumulate the result in F32 for both F32 and F16
+    TILE(float, M0, N0, c_f32);
+
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c_f32[i].v = 0;
+    })
+
+    for(int k = 0; k < K; k += MMUL_K0)
+    {
+        TILE(DATA_TYPE, 1, M0, a);
+        TILE(DATA_TYPE, 1, N0, b);
+
+        // Load tile from the lhs/rhs tensors
+        T_LOAD(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            LOOP_UNROLLING(int, n0, 0, 1, N0,
+            {
+                c_f32[m0].s[n0] = arm_matrix_multiply(a[0].s[m0], b[0].s[n0], c_f32[m0].s[n0]);
+            })
+        })
+
+        lhs_offset_first_element_in_bytes += MMUL_K0 * lhs_stride_y;
+        rhs_offset_first_element_in_bytes += MMUL_K0 * rhs_stride_y;
+    }
+
+    // For threads "outside" of the dst bound, we do not write but we have to "read" (arm_matrix_multiply). That's why this needs to happen after arm_matrix_multiply
+    if(dst_x_unclamped >= N || dst_y_unclamped >= M)
+    {
+        return;
+    }
+
+#if defined(HALF_PRECISION)
+    TILE(DATA_TYPE, M0, N0, c);
+
+    // Conversion required for the half precision
+    LOOP_UNROLLING(int, m0, 0, 1, M0,
+    {
+        LOOP_UNROLLING(int, n0, 0, 1, N0,
+        {
+            c[m0].s[n0] = c_f32[m0].s[n0];
+        })
+    })
+#else // defined(HALF_PRECISION)
+#define c c_f32
+#endif // defined(HALF_PRECISION)
+
+#ifdef BIAS
+    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);
+#endif // defined(BIAS)
+
+    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE(N0)
+                (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+    else
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE_PARTIAL(N0, N0_LEFTOVER)
+                (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+
+#undef MMUL_BLOCK_SIZE
+}
+#endif // defined(MAT_MUL_NATIVE_MMUL_T_NT)
+
+#if defined(MAT_MUL_NATIVE_MMUL_NT_T)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul) using MMUL: LHS non-transposed, RHS transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The tile's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=1).
+ * @note The number of leftover outputs rows/columns must be passed using -DN0_LEFTOVER and -DM0_LEFTOVER (e.g. -DN0_LEFTOVER=2, -DM0_LEFTOVER=3)
+ * @note The MMUL block dimension (MMUL_M0, MMUL_N0, MMUL_K0) must be passed at compile time using -DMMUL_M0, -DMMUL_N0 and -DMMUL_K0 (e.g. -DMMUL_M0=4, -DMMUL_N0=4, -DMMUL_K0=4).
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_MMUL_NT_T)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 > 0
+ *  - N0 = 1, 2, 3, 4, 8, 16
+ *  - K0 = 1
+ * @note Values > 8 for M0 are not expected to be efficient
+ *
+ * @param[in]  lhs_ptr                            Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in]  lhs_stride_y                       Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  lhs_stride_z                       Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  lhs_w                              The width of the lhs tensor
+ * @param[in]  lhs_h                              The height of the lhs tensor
+ * @param[in]  lhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the lhs matrix
+ * @param[in]  rhs_ptr                            Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  rhs_stride_y                       Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  rhs_w                              The width of the rhs tensor
+ * @param[in]  rhs_h                              The height of the rhs tensor
+ * @param[in]  rhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the rhs matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in]  bias_w                             (Optional) The size of the width dimension of the bias tensor
+ * @param[in]  bias_h                             (Optional) The size of the height dimension of the bias tensor
+ * @param[in]  bias_n                             (Optional) The size of the depth dimension of the bias tensor
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr                            Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  dst_stride_y                       Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  dst_w                              The width of the dst tensor
+ * @param[in]  dst_h                              The height of the dst tensor
+ * @param[in]  dst_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the dst matrix
+ * @param[in]  M                                  Number of rows in LHS matrix
+ * @param[in]  N                                  Number of columns in RHS matrix
+ * @param[in]  K                                  Number of columns in LHS matrix and columns in RHS matrix, which is multiple of MMUL_K0.
+ */
+__kernel void mat_mul_native_mmul_nt_t(
+    TENSOR3D_T(lhs, BUFFER),
+    TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+    TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+    TENSOR3D_T(dst, BUFFER),
+    const int M,
+    const int N,
+    const int K)
+{
+#define MMUL_BLOCK_SIZE (MMUL_M0 * MMUL_N0)
+    // For explanations on how this kernel works, please refer to NT/NT kernel. This kernel makes little modifications to it.
+
+    const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
+    // The upper limit is a simplified version of (N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE)
+    const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
+    const uint z  = get_global_id(2); // Batch
+
+    // Get block coordinates
+    const uint section_x = (x0 / MMUL_BLOCK_SIZE);
+    const uint section_y = y0;
+
+    // Get thread coordinates within a block
+    const uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+    const uint thread_x  = thread_id % MMUL_N0;
+    const uint thread_y  = (thread_id / MMUL_N0);
+
+    // Starting destination coordinates
+    // Note: We need to clamp dst_x and dst_y because we always need to execute a complete MMUL block! Only after the matrix multiplication
+    // part can we exit the kernel if it is out-of-bound. Remember, we have a cooperative matrix multiplication. Therefore, we need a full block to get the correct results
+    // Although we will never write out-of-bound, we still need this clamp to ensure that we do not read out-of-bound either.
+    const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;
+    const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;
+    const uint dst_x           = min(dst_x_unclamped, (uint)(N - N0));
+    const uint dst_y           = min(dst_y_unclamped, (uint)(M - M0));
+
+    // Starting LHS coordinates
+    const uint lhs_x = thread_x;
+    const uint lhs_y = dst_y;
+
+    // Starting RHS coordinates
+    const uint rhs_x = thread_y;
+    const uint rhs_y = dst_x;
+
+    // Compute LHS/RHS/DST matrix address
+    lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+    rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+    dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+    // Initialize the accumulators
+    // MMUL extension accumulate the result in F32 for both F32 and F16
+    TILE(float, M0, N0, c_f32);
+
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c_f32[i].v = 0;
+    })
+
+    for(int k = 0; k < K; k += MMUL_K0)
+    {
+        // A tile of M0xK0 but K0 must be set to 1
+        TILE(DATA_TYPE, M0, 1, a);
+        // A tile of N0xK0 but K0 must be set to 1
+        TILE(DATA_TYPE, N0, 1, b);
+
+        // Load tile from the lhs/rhs tensors
+        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            LOOP_UNROLLING(int, n0, 0, 1, N0,
+            {
+                c_f32[m0].s[n0] = arm_matrix_multiply(a[m0].s[0], b[n0].s[0], c_f32[m0].s[n0]);
+            })
+        })
+
+        lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+        rhs_offset_first_element_in_bytes += MMUL_N0 * sizeof(DATA_TYPE);
+    }
+
+    // For threads "outside" of the dst bound, we do not write but we have to "read" (arm_matrix_multiply). That's why this needs to happen after arm_matrix_multiply
+    if(dst_x_unclamped >= N || dst_y_unclamped >= M)
+    {
+        return;
+    }
+
+#if defined(HALF_PRECISION)
+    TILE(DATA_TYPE, M0, N0, c);
+
+    // Conversion required for the half precision
+    LOOP_UNROLLING(int, m0, 0, 1, M0,
+    {
+        LOOP_UNROLLING(int, n0, 0, 1, N0,
+        {
+            c[m0].s[n0] = c_f32[m0].s[n0];
+        })
+    })
+#else // defined(HALF_PRECISION)
+#define c c_f32
+#endif // defined(HALF_PRECISION)
+
+#ifdef BIAS
+    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);
+#endif // defined(BIAS)
+
+    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE(N0)
+                (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+    else
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE_PARTIAL(N0, N0_LEFTOVER)
+                (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+
+#undef MMUL_BLOCK_SIZE
+}
+#endif // defined(MAT_MUL_NATIVE_MMUL_NT_T)
+
+#if defined(MAT_MUL_NATIVE_MMUL_T_T)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul) using MMUL: LHS non-transposed, RHS transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The tile's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=1).
+ * @note The number of leftover outputs rows/columns must be passed using -DN0_LEFTOVER and -DM0_LEFTOVER (e.g. -DN0_LEFTOVER=2, -DM0_LEFTOVER=3)
+ * @note The MMUL block dimension (MMUL_M0, MMUL_N0, MMUL_K0) must be passed at compile time using -DMMUL_M0, -DMMUL_N0 and -DMMUL_K0 (e.g. -DMMUL_M0=4, -DMMUL_N0=4, -DMMUL_K0=4).
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_MMUL_NT_T)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 8, 16
+ *  - N0 = 1, 2, 3, 4, 8, 16
+ *  - K0 = 1
+ * @note Values > 8 for M0 are not expected to be efficient
+ *
+ * @param[in]  lhs_ptr                            Pointer to the lhs matrix. Supported data types: F32/F16
+ * @param[in]  lhs_stride_y                       Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  lhs_stride_z                       Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  lhs_w                              The width of the lhs tensor
+ * @param[in]  lhs_h                              The height of the lhs tensor
+ * @param[in]  lhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the lhs matrix
+ * @param[in]  rhs_ptr                            Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  rhs_stride_y                       Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  rhs_w                              The width of the rhs tensor
+ * @param[in]  rhs_h                              The height of the rhs tensor
+ * @param[in]  rhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the rhs matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in]  bias_w                             (Optional) The size of the width dimension of the bias tensor
+ * @param[in]  bias_h                             (Optional) The size of the height dimension of the bias tensor
+ * @param[in]  bias_n                             (Optional) The size of the depth dimension of the bias tensor
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr                            Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  dst_stride_y                       Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  dst_w                              The width of the dst tensor
+ * @param[in]  dst_h                              The height of the dst tensor
+ * @param[in]  dst_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the dst matrix
+ * @param[in]  M                                  Number of rows in LHS matrix
+ * @param[in]  N                                  Number of columns in RHS matrix
+ * @param[in]  K                                  Number of rows in LHS matrix and columns in RHS matrix, which is multiple of MMUL_K0.
+ */
+__kernel void mat_mul_native_mmul_t_t(
+    TENSOR3D_T(lhs, BUFFER),
+    TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+    TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+    TENSOR3D_T(dst, BUFFER),
+    const int M,
+    const int N,
+    const int K)
+{
+#define MMUL_BLOCK_SIZE (MMUL_M0 * MMUL_N0)
+    // For explanations on how this kernel works, please refer to NT/NT kernel. This kernel makes little modifications to it.
+
+    const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
+    // The upper limit is a simplified version of (N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE)
+    const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
+    const uint z  = get_global_id(2); // Batch
+
+    // Get block coordinates
+    const uint section_x = (x0 / MMUL_BLOCK_SIZE);
+    const uint section_y = y0;
+
+    // Get thread coordinates within a block
+    const uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+    const uint thread_x  = thread_id % MMUL_N0;
+    const uint thread_y  = (thread_id / MMUL_N0);
+
+    // Starting destination coordinates
+    // Note: We need to clamp dst_x and dst_y because we always need to execute a complete MMUL block! Only after the matrix multiplication
+    // part can we exit the kernel if it is out-of-bound. Remember, we have a cooperative matrix multiplication. Therefore, we need a full block to get the correct results
+    // Although we will never write out-of-bound, we still need this clamp to ensure that we do not read out-of-bound either.
+    const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;
+    const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;
+    const uint dst_x           = min(dst_x_unclamped, (uint)(N - N0));
+    const uint dst_y           = min(dst_y_unclamped, (uint)(M - M0));
+
+    // Starting LHS coordinates
+    const uint lhs_x = dst_y;
+    const uint lhs_y = thread_x;
+
+    // Starting RHS coordinates
+    const uint rhs_x = thread_y;
+    const uint rhs_y = dst_x;
+
+    // Compute LHS/RHS/DST matrix address
+    lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+    rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+    dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+    // Initialize the accumulators
+    // MMUL extension accumulate the result in F32 for both F32 and F16
+    TILE(float, M0, N0, c_f32);
+
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c_f32[i].v = 0;
+    })
+
+    for(int k = 0; k < K; k += MMUL_K0)
+    {
+        // A tile of K0xM0 but K0 must be set to 1
+        TILE(DATA_TYPE, 1, M0, a);
+        // A tile of N0xK0 but K0 must be set to 1
+        TILE(DATA_TYPE, N0, 1, b);
+
+        // Load tile from the lhs/rhs tensors
+        T_LOAD(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            LOOP_UNROLLING(int, n0, 0, 1, N0,
+            {
+                c_f32[m0].s[n0] = arm_matrix_multiply(a[0].s[m0], b[n0].s[0], c_f32[m0].s[n0]);
+            })
+        })
+
+        lhs_offset_first_element_in_bytes += MMUL_K0 * lhs_stride_y;
+        rhs_offset_first_element_in_bytes += MMUL_N0 * sizeof(DATA_TYPE);
+    }
+
+    // For threads "outside" of the dst bound, we do not write but we have to "read" (arm_matrix_multiply). That's why this needs to happen after arm_matrix_multiply
+    if(dst_x_unclamped >= N || dst_y_unclamped >= M)
+    {
+        return;
+    }
+
+#if defined(HALF_PRECISION)
+    TILE(DATA_TYPE, M0, N0, c);
+
+    // Conversion required for the half precision
+    LOOP_UNROLLING(int, m0, 0, 1, M0,
+    {
+        LOOP_UNROLLING(int, n0, 0, 1, N0,
+        {
+            c[m0].s[n0] = c_f32[m0].s[n0];
+        })
+    })
+#else // defined(HALF_PRECISION)
+#define c c_f32
+#endif // defined(HALF_PRECISION)
+
+#ifdef BIAS
+    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);
+#endif // defined(BIAS)
+
+    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE(N0)
+                (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+    else
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE_PARTIAL(N0, N0_LEFTOVER)
+                (c[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+
+#undef MMUL_BLOCK_SIZE
+}
+#endif // defined(MAT_MUL_NATIVE_MMUL_T_T)
diff --git a/src/core/CL/cl_kernels/common/mat_mul_quantized.cl b/src/core/CL/cl_kernels/common/mat_mul_quantized.cl
new file mode 100644
index 0000000000..7f81ac4549
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/mat_mul_quantized.cl
@@ -0,0 +1,833 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#ifdef BIAS
+// This function performs in-place bias addition for integer datatype when bias is enabled.
+// Note The tile's dimensions used for the LHS and RHS matrices (M0, N0) must be passed at compile time using -DN0, -DM0 (e.g. -DN0=8, -DM0=4).
+inline void perform_bias_addition(uchar *bias_ptr, uint bias_offset_first_element_in_bytes, TILE(int, M0, N0, acc), uint x)
+{
+    TILE(int, 1, N0, bias_tile);
+
+    // below expands to use bias_ptr and bias_offset_first_element_in_bytes
+    T_LOAD(int, 1, N0, BUFFER, bias, x, 0, 1, 0, bias_tile);
+
+    // c = c + bias[broadcasted]
+    T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, acc, bias_tile, acc);
+}
+#endif // defined(BIAS)
+
+#if defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
+ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output with the relu and bounded relu operations.
+ * @note The value of 0 in quantized format is equivalent to the quantization offset of the output data. This should be passed with -DZERO_POINT
+ * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_NT)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 > 0
+ *  - N0 = 1, 2, 3, 4, 8, 16
+ *  - K0 = 1, 2, 3, 4, 8, 16
+ * @note Values > 8 for M0 are not expected to be efficient
+ *
+ * @param[in]  lhs_ptr                            Pointer to the lhs matrix. Supported data types: QASYMM8_SIGNED/QASYMM8
+ * @param[in]  lhs_stride_y                       Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  lhs_stride_z                       Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  lhs_w                              The width of the lhs tensor
+ * @param[in]  lhs_h                              The height of the lhs tensor
+ * @param[in]  lhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the lhs matrix
+ * @param[in]  rhs_ptr                            Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  rhs_stride_y                       Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  rhs_w                              The width of the rhs tensor
+ * @param[in]  rhs_h                              The height of the rhs tensor
+ * @param[in]  rhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the rhs matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in]  bias_w                             (Optional) The size of the width dimension of the bias tensor
+ * @param[in]  bias_h                             (Optional) The size of the height dimension of the bias tensor
+ * @param[in]  bias_n                             (Optional) The size of the depth dimension of the bias tensor
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr                            Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  dst_stride_y                       Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  dst_w                              The width of the dst tensor
+ * @param[in]  dst_h                              The height of the dst tensor
+ * @param[in]  dst_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the dst matrix
+ */
+__kernel void mat_mul_native_quantized_nt_nt(
+    TENSOR3D_T(lhs, BUFFER),
+    TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+    TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+    TENSOR3D_T(dst, BUFFER))
+{
+    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
+    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+    const uint z = GET_SPATIAL_IDX(2, 1, 0);
+
+    // Compute LHS/RHS/DST matrix address
+    lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
+    rhs_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + z * rhs_stride_z;
+    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
+
+    // Initialize the accumulators
+    TILE(int, M0, N0, acc);
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
+    })
+
+    TILE(int, 1, N0, b_sum);
+    b_sum[0].v = 0;
+
+    TILE(int, 1, M0, a_sum);
+    a_sum[0].v = 0;
+
+    int k;
+    for(k = 0; k <= K - K0; k += K0)
+    {
+        TILE(DATA_TYPE, M0, K0, a);
+        TILE(DATA_TYPE, N0, K0, b);
+
+        LOOP_UNROLLING(int, i, 0, 1, M0,
+        {
+            a[i].v = 0;
+        })
+
+        LOOP_UNROLLING(int, i, 0, 1, N0,
+        {
+            b[i].v = 0;
+        })
+
+        // Load tile from the lhs tensor
+        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+
+        // Load tile from the rhs tensor in a transposed fashion
+        // in order to use T_MMUL_NT_T macro because only this macro
+        // can utilize dot product instruction for Int8/UInt8 by
+        // directly multiplying the rows of Lhs and Rhs tensors.
+        T_LOAD_TRANSPOSED(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);
+
+        LOOP_UNROLLING(int, i, 0, 1, M0,
+        {
+            LOOP_UNROLLING(int, j, 0, 1, K0,
+            {
+                a_sum[0].s[i] += (int)a[i].s[j];
+            })
+        })
+
+        LOOP_UNROLLING(int, i, 0, 1, K0,
+        {
+            LOOP_UNROLLING(int, j, 0, 1, N0,
+            {
+                b_sum[0].s[j] += (int)b[j].s[i];
+            })
+        })
+
+        lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
+        rhs_offset_first_element_in_bytes += K0 * rhs_stride_y;
+    }
+
+#if((K % K0) != 0)
+    /* Leftover Loop */
+    for(; k < K; ++k)
+    {
+        TILE(DATA_TYPE, M0, 1, a);
+        TILE(DATA_TYPE, N0, 1, b);
+
+        LOOP_UNROLLING(int, i, 0, 1, M0,
+        {
+            a[i].v = 0;
+        })
+
+        LOOP_UNROLLING(int, i, 0, 1, N0,
+        {
+            b[i].v = 0;
+        })
+
+        // Load tile from the lhs tensor
+        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+
+        // Load tile from the rhs tensor in a transposed fashion.
+        // See the main loop for more explanation
+        T_LOAD_TRANSPOSED(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);
+
+        LOOP_UNROLLING(int, i, 0, 1, M0,
+        {
+            LOOP_UNROLLING(int, j, 0, 1, 1,
+            {
+                a_sum[0].s[i] += (int)a[i].s[j];
+            })
+        })
+
+        LOOP_UNROLLING(int, i, 0, 1, 1,
+        {
+            LOOP_UNROLLING(int, j, 0, 1, N0,
+            {
+                b_sum[0].s[j] += (int)b[j].s[i];
+            })
+        })
+
+        lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
+        rhs_offset_first_element_in_bytes += 1 * rhs_stride_y;
+    }
+#endif // ((K % K0) != 0)
+
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        LOOP_UNROLLING(int, j, 0, 1, N0,
+        {
+            acc[i].s[j] -= ((int)RHS_OFFSET) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
+        })
+    })
+
+    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
+    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+
+#ifdef BIAS
+    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
+    // Quantize the tile
+    TILE(DATA_TYPE, M0, N0, accq);
+    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
+
+    T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq);
+
+    TILE(int, M0, 1, indirect_buffer);
+    LOOP_UNROLLING(int, _i, 0, 1, M0,
+    {
+        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
+    });
+
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
+}
+#endif // defined(MAT_MUL_NATIVE_QUANTIZED_NT_NT)
+
+#if defined(MAT_MUL_NATIVE_QUANTIZED_NT_T)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
+ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output bounded activation functions.
+ * @note The value of 0 in quantized format is equivalent to the quantization offset of the output data. This should be passed with -DZERO_POINT
+ * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_NT_T)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 > 0
+ *  - N0 = 1, 2, 3, 4, 8, 16
+ *  - K0 = 1, 2, 3, 4, 8, 16
+ * @note Values > 8 for M0, N0, K0 are not expected to be efficient
+ *
+ * @param[in]  lhs_ptr                            Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  lhs_stride_y                       Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  lhs_stride_z                       Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  lhs_w                              The width of the lhs tensor
+ * @param[in]  lhs_h                              The height of the lhs tensor
+ * @param[in]  lhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the lhs matrix
+ * @param[in]  rhs_ptr                            Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  rhs_stride_y                       Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  rhs_w                              The width of the rhs tensor
+ * @param[in]  rhs_h                              The height of the rhs tensor
+ * @param[in]  rhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the rhs matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in]  bias_w                             (Optional) The size of the width dimension of the bias tensor
+ * @param[in]  bias_h                             (Optional) The size of the height dimension of the bias tensor
+ * @param[in]  bias_n                             (Optional) The size of the depth dimension of the bias tensor
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr                            Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  dst_stride_y                       Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  dst_w                              The width of the dst tensor
+ * @param[in]  dst_h                              The height of the dst tensor
+ * @param[in]  dst_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the dst matrix
+ */
+__kernel void mat_mul_native_quantized_nt_t(
+    TENSOR3D_T(lhs, BUFFER),
+    TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+    TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+    TENSOR3D_T(dst, BUFFER))
+{
+    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
+    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+    const uint z = GET_SPATIAL_IDX(2, 1, 0);
+
+    // Compute LHS/RHS/DST matrix address
+    lhs_offset_first_element_in_bytes += y * lhs_stride_y + z * lhs_stride_z;
+    rhs_offset_first_element_in_bytes += x * rhs_stride_y + z * rhs_stride_z;
+    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
+
+    // Initialize the accumulators
+    TILE(int, M0, N0, acc);
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
+    })
+
+    TILE(int, 1, M0, a_sum);
+    a_sum[0].v = 0;
+
+    TILE(int, 1, N0, b_sum);
+    b_sum[0].v = 0;
+
+    int k;
+    for(k = 0; k <= K - K0; k += K0)
+    {
+        TILE(DATA_TYPE, M0, K0, a);
+        TILE(DATA_TYPE, N0, K0, b);
+
+        LOOP_UNROLLING(int, i, 0, 1, M0,
+        {
+            a[i].v = 0;
+        })
+
+        LOOP_UNROLLING(int, i, 0, 1, N0,
+        {
+            b[i].v = 0;
+        })
+
+        // Load tile from lhs/rhs tensors
+        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);
+
+        LOOP_UNROLLING(int, i, 0, 1, M0,
+        {
+            LOOP_UNROLLING(int, j, 0, 1, K0,
+            {
+                a_sum[0].s[i] += (int)a[i].s[j];
+            })
+        })
+
+        LOOP_UNROLLING(int, i, 0, 1, N0,
+        {
+            LOOP_UNROLLING(int, j, 0, 1, K0,
+            {
+                b_sum[0].s[i] += (int)b[i].s[j];
+            })
+        })
+
+        lhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
+        rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
+    }
+
+#if((K % K0) != 0)
+    // Leftover loop
+    for(; k < K; ++k)
+    {
+        TILE(DATA_TYPE, M0, 1, a);
+        TILE(DATA_TYPE, N0, 1, b);
+
+        LOOP_UNROLLING(int, i, 0, 1, M0,
+        {
+            a[i].v = 0;
+        })
+
+        LOOP_UNROLLING(int, i, 0, 1, N0,
+        {
+            b[i].v = 0;
+        })
+
+        // Load tile from lhs/rhs tensors
+        T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);
+
+        LOOP_UNROLLING(int, i, 0, 1, M0,
+        {
+            LOOP_UNROLLING(int, j, 0, 1, 1,
+            {
+                a_sum[0].s[i] += (int)a[i].s[j];
+            })
+        })
+
+        LOOP_UNROLLING(int, i, 0, 1, N0,
+        {
+            LOOP_UNROLLING(int, j, 0, 1, 1,
+            {
+                b_sum[0].s[i] += (int)b[i].s[j];
+            })
+        })
+
+        lhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
+        rhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
+    }
+#endif // ((K % K0) != 0)
+
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        LOOP_UNROLLING(int, j, 0, 1, N0,
+        {
+            acc[i].s[j] -= ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
+        })
+    })
+
+    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
+    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+
+#ifdef BIAS
+    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
+    // Quantize the tile
+    TILE(DATA_TYPE, M0, N0, accq);
+    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
+
+    T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq);
+
+    TILE(int, M0, 1, indirect_buffer);
+    LOOP_UNROLLING(int, _i, 0, 1, M0,
+    {
+        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
+    });
+
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
+}
+#endif // defined(MAT_MUL_NATIVE_QUANTIZED_NT_T)
+
+#if defined(MAT_MUL_NATIVE_QUANTIZED_T_NT)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS non-transposed
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
+ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output with the relu and bounded relu operations.
+ * @note The value of 0 in quantized format is equivalent to the quantization offset of the output data. This should be passed with -DZERO_POINT
+ * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_NT)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 > 0
+ *  - N0 = 1, 2, 3, 4, 8, 16
+ *  - K0 = 1, 2, 3, 4, 8, 16
+ * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
+ *
+ * @param[in]  lhs_ptr                            Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  lhs_stride_y                       Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  lhs_stride_z                       Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  lhs_w                              The width of the lhs tensor
+ * @param[in]  lhs_h                              The height of the lhs tensor
+ * @param[in]  lhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the lhs matrix
+ * @param[in]  rhs_ptr                            Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  rhs_stride_y                       Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  rhs_w                              The width of the rhs tensor
+ * @param[in]  rhs_h                              The height of the rhs tensor
+ * @param[in]  rhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the rhs matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in]  bias_w                             (Optional) The size of the width dimension of the bias tensor
+ * @param[in]  bias_h                             (Optional) The size of the height dimension of the bias tensor
+ * @param[in]  bias_n                             (Optional) The size of the depth dimension of the bias tensor
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr                            Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  dst_stride_y                       Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  dst_w                              The width of the dst tensor
+ * @param[in]  dst_h                              The height of the dst tensor
+ * @param[in]  dst_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the dst matrix
+ */
+__kernel void mat_mul_native_quantized_t_nt(
+    TENSOR3D_T(lhs, BUFFER),
+    TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+    TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+    TENSOR3D_T(dst, BUFFER))
+{
+    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
+    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+    const uint z = GET_SPATIAL_IDX(2, 1, 0);
+
+    // Compute LHS/RHS/DST matrix address
+    lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
+    rhs_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + z * rhs_stride_z;
+    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
+
+    // Initialize the accumulators
+    TILE(int, M0, N0, acc);
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
+    })
+
+    TILE(int, 1, N0, b_sum);
+    b_sum[0].v = 0;
+
+    TILE(int, 1, M0, a_sum);
+    a_sum[0].v = 0;
+
+    int k;
+    for(k = 0; k <= K - K0; k += K0)
+    {
+        TILE(DATA_TYPE, M0, K0, a);
+        TILE(DATA_TYPE, N0, K0, b);
+
+        LOOP_UNROLLING(int, i, 0, 1, M0,
+        {
+            a[i].v = 0;
+        })
+
+        LOOP_UNROLLING(int, i, 0, 1, N0,
+        {
+            b[i].v = 0;
+        })
+
+        // Load tile from the lhs/rhs tensors in a transposed fashion
+        // see mat_mul_native_quantized_nt_nt main loop for more explanation
+        T_LOAD_TRANSPOSED(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD_TRANSPOSED(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);
+
+        LOOP_UNROLLING(int, i, 0, 1, K0,
+        {
+            LOOP_UNROLLING(int, j, 0, 1, M0,
+            {
+                a_sum[0].s[j] += (int)a[j].s[i];
+            })
+        })
+
+        LOOP_UNROLLING(int, i, 0, 1, K0,
+        {
+            LOOP_UNROLLING(int, j, 0, 1, N0,
+            {
+                b_sum[0].s[j] += (int)b[j].s[i];
+            })
+        })
+
+        lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
+        rhs_offset_first_element_in_bytes += K0 * rhs_stride_y;
+    }
+
+#if((K % K0) != 0)
+    /* Leftover Loop */
+    for(; k < K; ++k)
+    {
+        TILE(DATA_TYPE, M0, 1, a);
+        TILE(DATA_TYPE, N0, 1, b);
+
+        LOOP_UNROLLING(int, i, 0, 1, M0,
+        {
+            a[i].v = 0;
+        })
+
+        LOOP_UNROLLING(int, i, 0, 1, N0,
+        {
+            b[i].v = 0;
+        })
+
+        // Load tile from the lhs/rhs tensors in a transposed fashion
+        // see mat_mul_native_quantized_nt_nt main loop for more explanation
+        T_LOAD_TRANSPOSED(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD_TRANSPOSED(DATA_TYPE, 1, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);
+
+        LOOP_UNROLLING(int, i, 0, 1, 1,
+        {
+            LOOP_UNROLLING(int, j, 0, 1, M0,
+            {
+                a_sum[0].s[j] += (int)a[j].s[i];
+            })
+        })
+
+        LOOP_UNROLLING(int, i, 0, 1, 1,
+        {
+            LOOP_UNROLLING(int, j, 0, 1, N0,
+            {
+                b_sum[0].s[j] += (int)b[j].s[i];
+            })
+        })
+
+        lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
+        rhs_offset_first_element_in_bytes += 1 * rhs_stride_y;
+    }
+#endif // ((K % K0) != 0)
+
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        LOOP_UNROLLING(int, j, 0, 1, N0,
+        {
+            acc[i].s[j] -= ((int)(RHS_OFFSET)) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
+        })
+    })
+
+    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
+    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+
+#ifdef BIAS
+    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
+    // Quantize the tile
+    TILE(DATA_TYPE, M0, N0, accq);
+    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
+
+    T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq);
+
+    TILE(int, M0, 1, indirect_buffer);
+    LOOP_UNROLLING(int, _i, 0, 1, M0,
+    {
+        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
+    });
+
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
+}
+#endif // defined(MAT_MUL_NATIVE_QUANTIZED_T_NT)
+
+#if defined(MAT_MUL_NATIVE_QUANTIZED_T_T)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS transposed
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
+ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The number of leftover outputs rows/columns must be passed using -DPARTIAL_STORE_N0 and -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_N0=2, -DPARTIAL_STORE_M0=3)
+ * @note The fused activation function used should be passed with -DACTIVATION_TYPE, -DA_VAL and -DB_VAL are used for min and max output with the relu and bounded relu operations.
+ * @note The value of 0 in quantized format is equivalent to the quantization offset of the output data. This should be passed with -DZERO_POINT
+ * @note The dimension K must be passed at compile time using -DK (e.g. -DK=6)
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_T_T)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 8, 16
+ *  - N0 = 1, 2, 3, 4, 8, 16
+ *  - K0 = 1, 2, 3, 4, 8, 16
+ * @note Values > 8 for M0, N0 and K0 are not expected to be efficient
+ *
+ * @param[in]  lhs_ptr                            Pointer to the lhs matrix. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  lhs_stride_y                       Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  lhs_stride_z                       Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  lhs_w                              The width of the lhs tensor
+ * @param[in]  lhs_h                              The height of the lhs tensor
+ * @param[in]  lhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the lhs matrix
+ * @param[in]  rhs_ptr                            Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  rhs_stride_y                       Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  rhs_w                              The width of the rhs tensor
+ * @param[in]  rhs_h                              The height of the rhs tensor
+ * @param[in]  rhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the rhs matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in]  bias_w                             (Optional) The size of the width dimension of the bias tensor
+ * @param[in]  bias_h                             (Optional) The size of the height dimension of the bias tensor
+ * @param[in]  bias_n                             (Optional) The size of the depth dimension of the bias tensor
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr                            Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  dst_stride_y                       Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  dst_w                              The width of the dst tensor
+ * @param[in]  dst_h                              The height of the dst tensor
+ * @param[in]  dst_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the dst matrix
+ */
+__kernel void mat_mul_native_quantized_t_t(
+    TENSOR3D_T(lhs, BUFFER),
+    TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+    TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+    TENSOR3D_T(dst, BUFFER))
+{
+    const uint x = GET_SPATIAL_IDX(0, N0, PARTIAL_STORE_N0);
+    const uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0);
+    const uint z = GET_SPATIAL_IDX(2, 1, 0);
+
+    // Compute LHS/RHS/DST matrix address
+    lhs_offset_first_element_in_bytes += y * sizeof(DATA_TYPE) + z * lhs_stride_z;
+    rhs_offset_first_element_in_bytes += x * rhs_stride_y + z * rhs_stride_z;
+    dst_offset_first_element_in_bytes += x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z;
+
+    // Initialize the accumulators
+    TILE(int, M0, N0, acc);
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        acc[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
+    })
+
+    TILE(int, 1, M0, a_sum);
+    a_sum[0].v = 0;
+
+    TILE(int, 1, N0, b_sum);
+    b_sum[0].v = 0;
+
+    int k;
+    for(k = 0; k <= K - K0; k += K0)
+    {
+        TILE(DATA_TYPE, M0, K0, a);
+        TILE(DATA_TYPE, N0, K0, b);
+
+        LOOP_UNROLLING(int, i, 0, 1, M0,
+        {
+            a[i].v = 0;
+        })
+
+        LOOP_UNROLLING(int, i, 0, 1, N0,
+        {
+            b[i].v = 0;
+        })
+
+        // Load tile from the lhs tensor in a transposed fashion
+        // see mat_mul_native_quantized_nt_nt main loop for more explanation
+        T_LOAD_TRANSPOSED(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+
+        // Load tile from the rhs tensor
+        T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, K0, NT, T, a, b, acc);
+
+        LOOP_UNROLLING(int, i, 0, 1, K0,
+        {
+            LOOP_UNROLLING(int, j, 0, 1, M0,
+            {
+                a_sum[0].s[j] += (int)a[j].s[i];
+            })
+        })
+
+        LOOP_UNROLLING(int, i, 0, 1, N0,
+        {
+            LOOP_UNROLLING(int, j, 0, 1, K0,
+            {
+                b_sum[0].s[i] += (int)b[i].s[j];
+            })
+        })
+
+        lhs_offset_first_element_in_bytes += K0 * lhs_stride_y;
+        rhs_offset_first_element_in_bytes += K0 * sizeof(DATA_TYPE);
+    }
+
+#if((K % K0) != 0)
+    /* Leftover Loop */
+    for(; k < K; ++k)
+    {
+        TILE(DATA_TYPE, M0, 1, a);
+        TILE(DATA_TYPE, N0, 1, b);
+
+        LOOP_UNROLLING(int, i, 0, 1, M0,
+        {
+            a[i].v = 0;
+        })
+
+        LOOP_UNROLLING(int, i, 0, 1, N0,
+        {
+            b[i].v = 0;
+        })
+
+        // Load tile from the lhs tensor in a transposed fashion
+        // see mat_mul_native_quantized_nt_nt main loop for more explanation
+        T_LOAD_TRANSPOSED(DATA_TYPE, 1, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+
+        // Load tile from the rhs tensor
+        T_LOAD(DATA_TYPE, N0, 1, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+        T_MMUL(DATA_TYPE, DATA_TYPE, int, M0, N0, 1, NT, T, a, b, acc);
+
+        LOOP_UNROLLING(int, i, 0, 1, 1,
+        {
+            LOOP_UNROLLING(int, j, 0, 1, M0,
+            {
+                a_sum[0].s[j] += (int)a[j].s[i];
+            })
+        })
+
+        LOOP_UNROLLING(int, i, 0, 1, N0,
+        {
+            LOOP_UNROLLING(int, j, 0, 1, 1,
+            {
+                b_sum[0].s[i] += (int)b[i].s[j];
+            })
+        })
+
+        lhs_offset_first_element_in_bytes += 1 * lhs_stride_y;
+        rhs_offset_first_element_in_bytes += 1 * sizeof(DATA_TYPE);
+    }
+#endif // ((K % K0) != 0)
+
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        LOOP_UNROLLING(int, j, 0, 1, N0,
+        {
+            acc[i].s[j] -= ((int)RHS_OFFSET) * a_sum[0].s[i] + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
+        })
+    })
+
+    const bool x_cond = PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0;
+    const bool y_cond = PARTIAL_STORE_M0 != 0 && get_global_id(1) == 0;
+
+#ifdef BIAS
+    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, acc, x);
+#endif // defined(BIAS)
+
+    // Quantize the tile
+    TILE(DATA_TYPE, M0, N0, accq);
+    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, acc, accq);
+
+    T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_POINT, A_VAL, B_VAL, accq, accq);
+
+    TILE(int, M0, 1, indirect_buffer);
+    LOOP_UNROLLING(int, _i, 0, 1, M0,
+    {
+        indirect_buffer[_i].v = min(_i, select(M0 - 1, PARTIAL_STORE_M0 - 1, y_cond));
+    });
+
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, 0, dst_stride_y, x_cond, accq, indirect_buffer);
+}
+#endif // defined(MAT_MUL_NATIVE_QUANTIZED_T_T)
diff --git a/src/core/CL/cl_kernels/common/mat_mul_quantized_mmul.cl b/src/core/CL/cl_kernels/common/mat_mul_quantized_mmul.cl
new file mode 100644
index 0000000000..fdfb75d39c
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/mat_mul_quantized_mmul.cl
@@ -0,0 +1,832 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#ifdef BIAS
+// This function performs in-place bias addition for integer datatype when bias is enabled.
+// Note The tile's dimensions used for the LHS and RHS matrices (M0, N0) must be passed at compile time using -DN0, -DM0 (e.g. -DN0=8, -DM0=4).
+inline void perform_bias_addition(uchar *bias_ptr, uint bias_offset_first_element_in_bytes, TILE(int, M0, N0, acc), uint x)
+{
+    TILE(int, 1, N0, bias_tile);
+
+    // below expands to use bias_ptr and bias_offset_first_element_in_bytes
+    T_LOAD(int, 1, N0, BUFFER, bias, x, 0, 1, 0, bias_tile);
+
+    // c = c + bias[broadcasted]
+    T_ELTWISE_BROADCAST_ADD_X(int, M0, N0, acc, bias_tile, acc);
+}
+#endif // defined(BIAS)
+
+#define MMUL_BLOCK_SIZE (MMUL_M0 * MMUL_N0) // MMUL block size for the output matrix
+
+#if defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_NT_NT)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS non-transposed - buffer only
+ *
+ * @note the "batch" here expresses the number of matrix multiplications to run in parallel. However, it
+ *       should NOT be confused with the batch size of the model. For NHWC the "batch" is the "H" dimension
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=uchar)
+ * @note The block's dimensions used for the LHS and RHS matrices (M0, N0 and K0) must be passed at
+ *       compile time using -DN0, -DM0 and -DK0 (e.g. -DN0=8, -DM0=4, -DK0=4).
+ * @note The number of leftover outputs rows/columns must be passed using -DN0_LEFTOVER and -DM0_LEFTOVER
+ *       (e.g. -DN0_LEFTOVER=2, -DM0_LEFTOVER=3)
+ * @note The dimensions M, N, K must be passed at compile time using -DK (e.g. -DM=5, -DN=8, -DK=6).
+ *       K must be a multiple of 16.
+ * @note MMUL block sizes must be passed at compile time using -DMMUL_K0, -DMMUL_M0, -DMMUL_N0
+ *       (e.g. -DMMUL_K0=16, -DMMUL_M0=4, -DMMUL_N0=4)
+ * @note If there is bias -DBIAS option must be passed at compile time
+ * @note Quantization offsets of lhs, rhs and dst tensors must be passed at compile time using -DLHS_OFFSET,
+ *       -DRHS_OFFSET, -DDST_OFFSET (e.g. -DLHS_OFFSET=10, -DRHS_OFFSET=0, -DDST_OFFSET=-6)
+ * @note Effective quantization multiplier and shift for the destination tensor must be passed at compile time using
+ *       -DDST_MULTIPLIER and -DDST_SHIFT (e.g. -DDST_MULTIPLIER=2091, -DST_SHIFT=8)
+ * @note The kernel name in uppercase must be passed at compile time (e.g. -DMAT_MUL_NATIVE_QUANTIZED_MMUL_NT_NT)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 > 0
+ *  - N0 = 1, 2, 3, 4, 8, 16
+ *  - K0 = 4
+ * @note For a generic view on how the MMUL works, see mat_mul_mmul.cl
+ *
+ * @param[in]  lhs_ptr                            Pointer to the lhs matrix. Supported data types: QASYMM8_SIGNED/QASYMM8
+ * @param[in]  lhs_stride_y                       Stride of the lhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  lhs_stride_z                       Stride of the lhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  lhs_w                              The width of the lhs tensor
+ * @param[in]  lhs_h                              The height of the lhs tensor
+ * @param[in]  lhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the lhs matrix
+ * @param[in]  rhs_ptr                            Pointer to the rhs matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  rhs_stride_y                       Stride of the rhs matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the rhs tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  rhs_w                              The width of the rhs tensor
+ * @param[in]  rhs_h                              The height of the rhs tensor
+ * @param[in]  rhs_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the rhs matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias tensor. Supported data type: S32
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in]  bias_w                             (Optional) The size of the width dimension of the bias tensor
+ * @param[in]  bias_h                             (Optional) The size of the height dimension of the bias tensor
+ * @param[in]  bias_n                             (Optional) The size of the depth dimension of the bias tensor
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[out] dst_ptr                            Pointer to the dst matrix. Supported data types: same as @p lhs_ptr
+ * @param[in]  dst_stride_y                       Stride of the dst matrix in Y (2nd) dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the dst tensor in Z (3rd) dimension (in bytes)
+ * @param[in]  dst_w                              The width of the dst tensor
+ * @param[in]  dst_h                              The height of the dst tensor
+ * @param[in]  dst_n                              Number of the matrices (buffers) in the batch
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the dst matrix
+ */
+__kernel void mat_mul_native_quantized_mmul_nt_nt(
+    TENSOR3D_T(lhs, BUFFER),
+    TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+    TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+    TENSOR3D_T(dst, BUFFER))
+{
+    // The explanation of how this kernel works is very similar to the explanation given in
+    // mat_mul_mmul.cl. The MMUL logic, and terminology is the same. The only difference is
+    // in quantization multiplication, the MMUL block sizes are (4 x 16) for Lhs matrix and
+    // (16 x 4) for Rhs matrix, resulting in (4 x 4) MMUL block size for the destination.
+    //
+    // Figures 1, 2 and 3 in the previous explanation works the same. Since the Lhs and Rhs
+    // MMUL block sizes are different in quantized extension, the thread access pattern is
+    // slightly different. We can redraw Figure 4 (Thread access pattern) as follows:
+    //
+    //                                 (Modified Figure 4 from mat_mul_mmul.cl)
+    //                               Thread Access Layouts in LHS & RHS matrices
+    //
+    //                                    LHS matrix
+    //                 4 times      4 times          4 times        4 times
+    //           _______________________________________________________________
+    //          |T0_|T0_|T0_|T0_|T1_|T1_|T1_|T1_|T2_|T2_|T2_|T2_|T3_|T3_|T3_|T3_|
+    //          |T0_| ...                                                       |
+    //    M0    |   .    .                                                      |
+    //   Times  |   .       .                                                   |
+    //          |   .           .                                               |
+    //          |T0_|T0_|T0_|T0_|T1_|T1_|T1_|T1_|T2_|T2_|T2_|T2_|T3_|T3_|T3_|T3_|
+    //          |T4_|T4_|T4_|T4_|T5_|T5_|T5_|T5_|T6_|T6_|T6_|T6_|T7_|T7_|T7_|T7_|
+    //          |T4_|T4_|T4_|T4_|T5_|T5_|T5_|T5_|T6_|T6_|T6_|T6_|T7_|T7_|T7_|T7_|
+    //    M0    |   .    .                                                      |
+    //   Times  |   .       .                                                   |
+    //          |   .           .                                               |
+    //          |T4_|T4_|T4_|T4_|T5_|T5_|T5_|T5_|T6_|T6_|T6_|T6_|T7_|T7_|T7_|T7_|
+    //          |T8_|T8_|T8_|T8_|T9_|T9_|T9_|T9_|T10|T10|T10|T10|T11|T11|T11|T11|
+    //    M0    |   .                                                           |
+    //   Times  |   .                                                           |
+    //          |   .                                                           |
+    //          |T8_|T8_|T8_|T8_|T9_|T9_|T9_|T9_|T10|T10|T10|T10|T11|T11|T11|T11|
+    //    M0    |   .                                                           |
+    //   Times  |   .                                                           |
+    //          |   .                                                           |
+    //          |T12|T12|T12|T12|T13|T13|T13|T13|T14|T14|T14|T14|T15|T15|T15|T15|
+    //
+    //
+    //                                                  RHS Matrix
+    //
+    //                   __________N0 times______N0 times____________________N0 times_______
+    //                  |__T0__| ... |__T0__|__T1__| ...  |__T1__| ... |__T3__| ... |__T3__|
+    //       4 times    |__T0__| ... |__T0__|__T1__| ...  |__T1__| ... |__T3__| ... |__T3__|
+    //                  |__T0__| ... |__T0__|__T1__| ...  |__T1__| ... |__T3__| ... |__T3__|
+    //                  |__T0__| ... |__T0__|__T1__| ...  |__T1__| ... |__T3__| ... |__T3__|
+    //                  |__T4__| ... |__T4__|__T5__| ...  |__T5__| ... |__T7__| ... |__T7__|
+    //       4 times    |__T4__| ... |__T4__|__T5__| ...  |__T5__| ... |__T7__| ... |__T7__|
+    //                  |__T4__| ... |__T4__|__T5__| ...  |__T5__| ... |__T7__| ... |__T7__|
+    //           X      |__T4__| ... |__T4__|__T5__| ...  |__T5__| ... |__T7__| ... |__T7__|
+    //                  |__T8__| ... |__T8__|__T9__| ...  |__T9__| ... |__T11_| ... |__T11_|
+    //                  |__T8__| ... |__T8__|__T9__| ...  |__T9__| ... |__T11_| ... |__T11_|
+    //       4 times    |__T8__| ... |__T8__|__T9__| ...  |__T9__| ... |__T11_| ... |__T11_|
+    //                  |__T8__| ... |__T8__|__T9__| ...  |__T9__| ... |__T11_| ... |__T11_|
+    //                  |__T12_| ... |__T12_|__T13_| ...  |__T13_| ... |__T15_| ... |__T15_|
+    //       4 times    |__T12_| ... |__T12_|__T13_| ...  |__T13_| ... |__T15_| ... |__T15_|
+    //                  |__T12_| ... |__T12_|__T13_| ...  |__T13_| ... |__T15_| ... |__T15_|
+    //                  |__T12_|_____|__T12_|__T13_|______|__T13_|_____|__T15_|_____|__T15_|
+    //
+    //
+    // The logic behind this thread access pattern is already descried in the explanation
+    // in mat_mul_mmul.cl. The only change is threads accesses are extended to 4 elements
+    // from 1, in rightward direction in Lhs, and in downward direction in Rhs, because they
+    // are now operating on 4 char/uchar's (again 32-bit data), instead of one 32-bit floating point.
+    //
+    // The mathematical view of the matrix multiplication explained in Figure 5 also holds for this,
+    // except the dimension 4 is 16 instead, but the vector notations do not change, i.e. it's as follows:
+    //
+    //   Settings:
+    //          - a 8 x 16 LHS section
+    //          - 16 x 8 RHS section
+    //          - Each vector variable ai, bj represent a 16x1 vector
+    //          - ^T (superscript T) denotes transpose
+    //          - M0 = N0 = 2
+    //          - MMUL_N0 = MMUL_M0 = 4, MMUL_K0 = 16
+    //
+    //
+    //                                             (Modified Figure 5)
+    //                              Mathematical view of the Matrix Multiplication
+    //
+    //      LHS                           RHS                                           DST
+    //    [  a1^T  ]            [ b1 b2 b3 b4 b5 b6 b7 ]                [ a1^Tb1  a1^Tb2  a1^Tb3 ... a1^Tb7 ]
+    //    [  a2^T  ]                                    16 x 8          [ a2^Tb1  a2^Tb2  a2^Tb3 ... a2^Tb7 ]
+    //    [  a3^T  ]                                                    [                                   ]
+    //    [  a4^T  ]                                                =   [   .       .                       ]
+    //    [  a5^T  ]        X                                           [   .          .                    ]
+    //    [  a6^T  ]                                                    [   .             .                 ]
+    //    [  a7^T  ]                                                    [                                   ]
+    //    [  a8^T  ]                                                    [ a7^Tb1  a7^Tb2  a7^Tb3 ... a7^Tb7 ]
+    //              8 x 16                                                                                     8 x 8
+    //
+    //
+    //  For the first iteration, i.e. (m0, n0) = (0, 0), the arm_matrix_multiply would multiply the following matrices:
+    //
+    //    [  a1^T  ]            [  b1 b3 b5 b7 ]                [ a1^Tb1  a1^Tb3  a1^Tb5  a1^Tb7 ]
+    //    [  a3^T  ]        x                   4 x 4     =     [ a3^Tb1  a1^Tb3  a1^Tb5  a1^Tb7 ]
+    //    [  a5^T  ]                                            [ a5^Tb1  a1^Tb3  a1^Tb5  a1^Tb7 ]
+    //    [  a7^T  ]                                            [ a7^Tb1  a7^Tb3  a7^Tb5  a7^Tb7 ]
+    //              4 x 4                                                                         4 x 4
+    // The elements calculated in the 4x4 output block are the "interleaved" elements in the DST above.
+    // When we follow for each combination of (m0, n0), every element of the DST matrix "section" is filled.
+    //
+    // Please refer to mat_mul_mmul.cl for more details.
+
+    const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
+    // The upper limit is a simplified version of (N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE)
+    const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
+    const uint z  = get_global_id(2); // Batch
+
+    // Get section coordinates
+    const uint section_x = (x0 / MMUL_BLOCK_SIZE);
+    const uint section_y = y0;
+
+    // Get thread coordinates within an mmul block
+    const uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+    const uint thread_x  = thread_id % MMUL_N0;
+    const uint thread_y  = (thread_id / MMUL_N0);
+
+    // Calculate dst coordinates
+    const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;
+    const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;
+    const uint dst_x           = min(dst_x_unclamped, (uint)(N - N0));
+    const uint dst_y           = min(dst_y_unclamped, (uint)(M - M0));
+
+    // Starting LHS coordinates
+    const uint lhs_x = K0 * thread_x;
+    const uint lhs_y = dst_y;
+
+    // Starting RHS coordinates
+    const uint rhs_x = dst_x;
+    const uint rhs_y = K0 * thread_y;
+
+    // Compute LHS/RHS/DST matrix address
+    lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+    rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+    dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+    // Initialize the accumulators
+    TILE(int, M0, N0, c);
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
+    })
+
+    // Calculate row and column sums
+    TILE(int, 1, N0, b_sum);
+    b_sum[0].v = 0;
+
+    TILE(int, 1, M0, a_sum);
+    a_sum[0].v = 0;
+
+    VEC_DATA_TYPE(DATA_TYPE, K0)
+    vec_1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(1, 1, 1, 1);
+
+    for(int k = 0; k < lhs_w; k += MMUL_K0)
+    {
+        // A tile of M0xK0 but K0 must be set to K0
+        TILE(DATA_TYPE, M0, K0, a);
+        // A tile of K0xN0 but K0 must be set to K0
+        TILE(DATA_TYPE, K0, N0, b);
+
+        // Load tile from the lhs/rhs tensors
+        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+        LOOP_UNROLLING(int, n0, 0, 1, N0,
+        {
+            VEC_DATA_TYPE(DATA_TYPE, K0)
+            vec_b = (VEC_DATA_TYPE(DATA_TYPE, K0))(b[0].s[n0], b[1].s[n0], b[2].s[n0], b[3].s[n0]);
+
+            LOOP_UNROLLING(int, m0, 0, 1, M0,
+            {
+                c[m0].s[n0] = arm_matrix_multiply(a[m0].v, vec_b, c[m0].s[n0]);
+            })
+
+#if LHS_OFFSET != 0
+            // Column Sum of B: Calculate the sum of columns by multiplying B
+            // with a matrix of 1's from Left
+            b_sum[0].s[n0] = arm_matrix_multiply(vec_1, vec_b, b_sum[0].s[n0]);
+#endif // LHS_OFFSET != 0s
+        })
+
+#if RHS_OFFSET != 0
+        // Row Sum of A: Calculate the sum of rows by multiplying A with
+        // a matrix of 1's from Right
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            a_sum[0].s[m0] = arm_matrix_multiply(a[m0].v, vec_1, a_sum[0].s[m0]);
+        })
+#endif // RHS_OFFSET != 0
+
+        lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+        rhs_offset_first_element_in_bytes += MMUL_K0 * rhs_stride_y;
+    }
+
+    // Do not write if the coordinates are out of bound
+    // But, read has to happen as arm_matrix_multiply() expects certain number of calls
+    if(dst_x_unclamped >= N || dst_y_unclamped >= M)
+    {
+        return;
+    }
+
+#if RHS_OFFSET != 0 || LHS_OFFSET != 0
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        const int A = ((int)RHS_OFFSET) * a_sum[0].s[i];
+        LOOP_UNROLLING(int, j, 0, 1, N0,
+        {
+            c[i].s[j] -= A + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
+        })
+    })
+#endif // RHS_OFFSET != 0 || LHS_OFFSET != 0
+
+#ifdef BIAS
+    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);
+#endif // defined(BIAS)
+
+    // Quantize the tile
+    TILE(DATA_TYPE, M0, N0, cq);
+    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
+
+    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE(N0)
+                (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+    else
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE_PARTIAL(N0, N0_LEFTOVER)
+                (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+}
+#endif // defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_NT_NT)
+
+#if defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_NT_T)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS non-transposed, RHS transposed - buffer only
+ *
+ * Supported block configurations:
+ *  - M0 > 0
+ *  - N0 = 1, 2, 3, 4, 8, 16
+ *  - K0 = 4
+ *
+ * Similar to mat_mul_native_quantized_mmul_nt_nt()
+ */
+__kernel void mat_mul_native_quantized_mmul_nt_t(
+    TENSOR3D_T(lhs, BUFFER),
+    TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+    TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+    TENSOR3D_T(dst, BUFFER))
+{
+    const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
+    // The upper limit is a simplified version of (N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE)
+    const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
+    const uint z  = get_global_id(2); // Batch
+
+    // Get section coordinates
+    const uint section_x = (x0 / MMUL_BLOCK_SIZE);
+    const uint section_y = y0;
+
+    // Get thread coordinates within an mmul block
+    const uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+    const uint thread_x  = thread_id % MMUL_N0;
+    const uint thread_y  = (thread_id / MMUL_N0);
+
+    // Calculate dst coordinates
+    const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;
+    const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;
+    const uint dst_x           = min(dst_x_unclamped, (uint)(N - N0));
+    const uint dst_y           = min(dst_y_unclamped, (uint)(M - M0));
+
+    // Starting LHS coordinates
+    const uint lhs_x = K0 * thread_x;
+    const uint lhs_y = dst_y;
+
+    // Starting RHS coordinates
+    const uint rhs_x = K0 * thread_y;
+    const uint rhs_y = dst_x;
+
+    // Compute LHS/RHS/DST matrix address
+    lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+    rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+    dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+    // Initialize the accumulators
+    TILE(int, M0, N0, c);
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
+    })
+
+    // Calculate row and column sums
+    TILE(int, 1, N0, b_sum);
+    b_sum[0].v = 0;
+
+    TILE(int, 1, M0, a_sum);
+    a_sum[0].v = 0;
+
+    VEC_DATA_TYPE(DATA_TYPE, K0)
+    vec_1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(1, 1, 1, 1);
+
+    for(int k = 0; k < lhs_w; k += MMUL_K0)
+    {
+        // A tile of M0xK0 but K0 must be set to K0
+        TILE(DATA_TYPE, M0, K0, a);
+        // A tile of K0xN0 but K0 must be set to K0
+        TILE(DATA_TYPE, N0, K0, b);
+
+        // Load tile from the lhs/rhs tensors
+        T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            LOOP_UNROLLING(int, n0, 0, 1, N0,
+            {
+                c[m0].s[n0] = arm_matrix_multiply(a[m0].v, b[n0].v, c[m0].s[n0]);
+            })
+        })
+
+#if RHS_OFFSET != 0
+        // Row Sum of A: Calculate the sum of rows by multiplying A with
+        // a matrix of 1's from Right
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            a_sum[0].s[m0] = arm_matrix_multiply(a[m0].v, vec_1, a_sum[0].s[m0]);
+        })
+#endif // RHS_OFFSET != 0
+
+#if LHS_OFFSET != 0
+        // Column Sum of B: Calculate the sum of columns by multiplying B
+        // with a matrix of 1's from Left
+        LOOP_UNROLLING(int, n0, 0, 1, N0,
+        {
+            b_sum[0].s[n0] = arm_matrix_multiply(vec_1, b[n0].v, b_sum[0].s[n0]);
+        })
+#endif // LHS_OFFSET != 0
+
+        lhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+        rhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+    }
+
+    // Do not write if the coordinates are out of bound
+    // But, read has to happen as arm_matrix_multiply() expects certain number of calls
+    if(dst_x_unclamped >= N || dst_y_unclamped >= M)
+    {
+        return;
+    }
+
+#if RHS_OFFSET != 0 || LHS_OFFSET != 0
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        const int A = ((int)RHS_OFFSET) * a_sum[0].s[i];
+        LOOP_UNROLLING(int, j, 0, 1, N0,
+        {
+            c[i].s[j] -= A + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
+        })
+    })
+#endif // RHS_OFFSET != 0 || LHS_OFFSET != 0
+
+#ifdef BIAS
+    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);
+#endif // defined(BIAS)
+
+    // Quantize the tile
+    TILE(DATA_TYPE, M0, N0, cq);
+    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
+
+    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE(N0)
+                (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+    else
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE_PARTIAL(N0, N0_LEFTOVER)
+                (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+}
+#endif // defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_NT_T)
+
+#if defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_T_NT)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS non-transposed
+ *
+ * Supported block configurations:
+ *  - M0 = 1, 2, 3, 4, 8, 16
+ *  - N0 = 1, 2, 3, 4, 8, 16
+ *  - K0 = 4
+ *
+ * Similar to mat_mul_native_quantized_mmul_nt_nt()
+ */
+__kernel void mat_mul_native_quantized_mmul_t_nt(
+    TENSOR3D_T(lhs, BUFFER),
+    TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+    TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+    TENSOR3D_T(dst, BUFFER))
+{
+    const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
+    // The upper limit is a simplified version of (N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE)
+    const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
+    const uint z  = get_global_id(2); // Batch
+
+    // Get section coordinates
+    const uint section_x = (x0 / MMUL_BLOCK_SIZE);
+    const uint section_y = y0;
+
+    // Get thread coordinates within an mmul block
+    const uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+    const uint thread_x  = thread_id % MMUL_N0;
+    const uint thread_y  = (thread_id / MMUL_N0);
+
+    // Calculate dst coordinates
+    const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;
+    const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;
+    const uint dst_x           = min(dst_x_unclamped, (uint)(N - N0));
+    const uint dst_y           = min(dst_y_unclamped, (uint)(M - M0));
+
+    // Starting LHS coordinates
+    const uint lhs_x = dst_y;
+    const uint lhs_y = K0 * thread_x;
+
+    // Starting RHS coordinates
+    const uint rhs_x = dst_x;
+    const uint rhs_y = K0 * thread_y;
+
+    // Compute LHS/RHS/DST matrix address
+    lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+    rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+    dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+    // Initialize the accumulators
+    TILE(int, M0, N0, c);
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
+    })
+
+    // Calculate row and column sums
+    TILE(int, 1, N0, b_sum);
+    b_sum[0].v = 0;
+
+    TILE(int, 1, M0, a_sum);
+    a_sum[0].v = 0;
+
+    VEC_DATA_TYPE(DATA_TYPE, K0)
+    vec_1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(1, 1, 1, 1);
+
+    for(int k = 0; k < lhs_h; k += MMUL_K0)
+    {
+        TILE(DATA_TYPE, K0, M0, a);
+        TILE(DATA_TYPE, K0, N0, b);
+
+        // Load tile from the lhs/rhs tensors
+        T_LOAD(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD(DATA_TYPE, K0, N0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            VEC_DATA_TYPE(DATA_TYPE, K0)
+            vec_a = (VEC_DATA_TYPE(DATA_TYPE, K0))(a[0].s[m0], a[1].s[m0], a[2].s[m0], a[3].s[m0]);
+
+            LOOP_UNROLLING(int, n0, 0, 1, N0,
+            {
+                VEC_DATA_TYPE(DATA_TYPE, K0)
+                vec_b = (VEC_DATA_TYPE(DATA_TYPE, K0))(b[0].s[n0], b[1].s[n0], b[2].s[n0], b[3].s[n0]);
+
+                c[m0].s[n0] = arm_matrix_multiply(vec_a, vec_b, c[m0].s[n0]);
+            })
+
+#if RHS_OFFSET != 0
+            // Row Sum of A: Calculate the sum of rows by multiplying A with
+            // a matrix of 1's from Right
+            a_sum[0].s[m0] = arm_matrix_multiply(vec_a, vec_1, a_sum[0].s[m0]);
+#endif // RHS_OFFSET != 0
+        })
+
+#if LHS_OFFSET != 0
+        // Column Sum of B: Calculate the sum of columns by multiplying B
+        // with a matrix of 1's from Left
+        LOOP_UNROLLING(int, n0, 0, 1, N0,
+        {
+            VEC_DATA_TYPE(DATA_TYPE, K0)
+            vec_b = (VEC_DATA_TYPE(DATA_TYPE, K0))(b[0].s[n0], b[1].s[n0], b[2].s[n0], b[3].s[n0]);
+
+            b_sum[0].s[n0] = arm_matrix_multiply(vec_1, vec_b, b_sum[0].s[n0]);
+        })
+#endif // LHS_OFFSET != 0
+
+        lhs_offset_first_element_in_bytes += MMUL_K0 * lhs_stride_y;
+        rhs_offset_first_element_in_bytes += MMUL_K0 * rhs_stride_y;
+    }
+
+    // Do not write if the coordinates are out of bound
+    // But, read has to happen as arm_matrix_multiply() expects certain number of calls
+    if(dst_x_unclamped >= N || dst_y_unclamped >= M)
+    {
+        return;
+    }
+
+#if RHS_OFFSET != 0 || LHS_OFFSET != 0
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        const int A = ((int)RHS_OFFSET) * a_sum[0].s[i];
+        LOOP_UNROLLING(int, j, 0, 1, N0,
+        {
+            c[i].s[j] -= A + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
+        })
+    })
+#endif // RHS_OFFSET != 0 || LHS_OFFSET != 0
+
+#ifdef BIAS
+    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);
+#endif // defined(BIAS)
+
+    // Quantize the tile
+    TILE(DATA_TYPE, M0, N0, cq);
+    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
+
+    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE(N0)
+                (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+    else
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE_PARTIAL(N0, N0_LEFTOVER)
+                (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+}
+#endif // defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_T_NT)
+
+#if defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_T_T)
+/** This OpenCL kernel performs the batch matrix multiplication (BatchMatMul): LHS transposed, RHS transposed
+ *
+ * Supported block configurations:
+ *  - M0 = 1, 2, 3, 4, 8, 16
+ *  - N0 = 1, 2, 3, 4, 8, 16
+ *  - K0 = 4
+ *
+ * Similar to mat_mul_native_quantized_mmul_nt_nt()
+ */
+__kernel void mat_mul_native_quantized_mmul_t_t(
+    TENSOR3D_T(lhs, BUFFER),
+    TENSOR3D_T(rhs, BUFFER),
+#ifdef BIAS
+    TENSOR3D_T(bias, BUFFER),
+#endif // defined(BIAS)
+    TENSOR3D_T(dst, BUFFER))
+{
+    const uint x0 = get_global_id(0); // [0, (N / N0) * MMUL_M0)
+    // The upper limit is a simplified version of (N / N0) / MMUL_N0) * MMUL_BLOCK_SIZE)
+    const uint y0 = get_global_id(1); // [0, (M / M0) / MMUL_M0)
+    const uint z  = get_global_id(2); // Batch
+
+    // Get section coordinates
+    const uint section_x = (x0 / MMUL_BLOCK_SIZE);
+    const uint section_y = y0;
+
+    // Get thread coordinates within an mmul block
+    const uint thread_id = (x0 % MMUL_BLOCK_SIZE);
+    const uint thread_x  = thread_id % MMUL_N0;
+    const uint thread_y  = (thread_id / MMUL_N0);
+
+    // Calculate dst coordinates
+    const uint dst_x_unclamped = thread_x * N0 + section_x * N0 * MMUL_N0;
+    const uint dst_y_unclamped = thread_y * M0 + section_y * M0 * MMUL_M0;
+    const uint dst_x           = min(dst_x_unclamped, (uint)(N - N0));
+    const uint dst_y           = min(dst_y_unclamped, (uint)(M - M0));
+
+    // Starting LHS coordinates
+    const uint lhs_x = dst_y;
+    const uint lhs_y = K0 * thread_x;
+
+    // Starting RHS coordinates
+    const uint rhs_x = K0 * thread_y;
+    const uint rhs_y = dst_x;
+
+    // Compute LHS/RHS/DST matrix address
+    lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+    rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+    dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+
+    // Initialize the accumulators
+    TILE(int, M0, N0, c);
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        c[i].v = K * ((int)LHS_OFFSET) * ((int)RHS_OFFSET);
+    })
+
+    // Calculate row and column sums
+    TILE(int, 1, N0, b_sum);
+    b_sum[0].v = 0;
+
+    TILE(int, 1, M0, a_sum);
+    a_sum[0].v = 0;
+
+    VEC_DATA_TYPE(DATA_TYPE, K0)
+    vec_1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(1, 1, 1, 1);
+
+    for(int k = 0; k < lhs_h; k += MMUL_K0)
+    {
+        TILE(DATA_TYPE, K0, M0, a);
+        TILE(DATA_TYPE, N0, K0, b);
+
+        // Load tile from the lhs/rhs tensors
+        T_LOAD(DATA_TYPE, K0, M0, BUFFER, lhs, 0, 0, 1, lhs_stride_y, a);
+        T_LOAD(DATA_TYPE, N0, K0, BUFFER, rhs, 0, 0, 1, rhs_stride_y, b);
+
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            VEC_DATA_TYPE(DATA_TYPE, K0)
+            vec_a = (VEC_DATA_TYPE(DATA_TYPE, K0))(a[0].s[m0], a[1].s[m0], a[2].s[m0], a[3].s[m0]);
+
+            LOOP_UNROLLING(int, n0, 0, 1, N0,
+            {
+                c[m0].s[n0] = arm_matrix_multiply(vec_a, b[n0].v, c[m0].s[n0]);
+            })
+#if RHS_OFFSET != 0
+            // Row Sum of A: Calculate the sum of rows by multiplying A with
+            // a matrix of 1's from Right
+            a_sum[0].s[m0] = arm_matrix_multiply(vec_a, vec_1, a_sum[0].s[m0]);
+#endif // RHS_OFFSET != 0
+        })
+
+#if LHS_OFFSET != 0
+        // Column Sum of B: Calculate the sum of columns by multiplying B
+        // with a matrix of 1's from Left
+        LOOP_UNROLLING(int, n0, 0, 1, N0,
+        {
+            b_sum[0].s[n0] = arm_matrix_multiply(vec_1, b[n0].v, b_sum[0].s[n0]);
+        })
+#endif // LHS_OFFSET != 0
+
+        lhs_offset_first_element_in_bytes += MMUL_K0 * lhs_stride_y;
+        rhs_offset_first_element_in_bytes += MMUL_K0 * sizeof(DATA_TYPE);
+    }
+
+    // Do not write if the coordinates are out of bound
+    // But, read has to happen as arm_matrix_multiply() expects certain number of calls
+    if(dst_x_unclamped >= N || dst_y_unclamped >= M)
+    {
+        return;
+    }
+
+#if RHS_OFFSET != 0 || LHS_OFFSET != 0
+    LOOP_UNROLLING(int, i, 0, 1, M0,
+    {
+        const int A = ((int)RHS_OFFSET) * a_sum[0].s[i];
+        LOOP_UNROLLING(int, j, 0, 1, N0,
+        {
+            c[i].s[j] -= A + ((int)(LHS_OFFSET)) * b_sum[0].s[j];
+        })
+    })
+#endif // RHS_OFFSET != 0 || LHS_OFFSET != 0
+
+#ifdef BIAS
+    perform_bias_addition(bias_ptr, bias_offset_first_element_in_bytes, c, dst_x);
+#endif // defined(BIAS)
+
+    // Quantize the tile
+    TILE(DATA_TYPE, M0, N0, cq);
+    T_QUANTIZE8_ASYMMETRIC(int, DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
+
+    if(dst_x + N0 <= N || N0_LEFTOVER == 0)
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE(N0)
+                (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+    else
+    {
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            if(dst_y + m0 < M || M0_LEFTOVER == 0)
+            {
+                VSTORE_PARTIAL(N0, N0_LEFTOVER)
+                (cq[m0].v, 0, (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + m0 * dst_stride_y));
+            }
+        })
+    }
+}
+#endif // defined(MAT_MUL_NATIVE_QUANTIZED_MMUL_T_T)
diff --git a/src/core/CL/cl_kernels/common/mean_stddev_normalization.cl b/src/core/CL/cl_kernels/common/mean_stddev_normalization.cl
new file mode 100644
index 0000000000..22abf64874
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/mean_stddev_normalization.cl
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2019-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(WIDTH)
+/** This function normalizes the input 2D tensor across the first dimension with respect to mean and standard deviation of the same dimension.
+ *
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Data type should be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Width of the input tensor should be passed using the -DWIDTH compile flag, e.g. -DWIDTH=16
+ * @attention Normalization epsilon parameter should be given as a preprocessor argument with -DEPSILON=value. e.g. -DEPSILON=0.001f
+ *
+ * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] output_ptr                           (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ */
+__kernel void mean_stddev_normalization(
+    IMAGE_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    IMAGE_DECLARATION(output)
+#endif /* IN_PLACE */
+)
+{
+    // Get pixels pointer
+    Image in = CONVERT_TO_IMAGE_STRUCT(input);
+#ifdef IN_PLACE
+    Image out = in;
+#else  /* IN_PLACE */
+    Image out = CONVERT_TO_IMAGE_STRUCT(output);
+#endif /* IN_PLACE */
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    sum = 0.f;
+#ifdef MEANSTDNORM_HALF
+    VEC_DATA_TYPE(float, VEC_SIZE)
+#else  /* MEANSTDNORM_HALF */
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#endif /* MEANSTDNORM_HALF */
+    sum_sq = 0.f;
+    // Calculate partial sum
+    int i = 0;
+    for(; i <= (WIDTH - VEC_SIZE); i += VEC_SIZE)
+    {
+        // Load data
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+        data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)offset(&in, i, 0));
+
+        sum += data;
+#ifdef MEANSTDNORM_HALF
+        VEC_DATA_TYPE(float, VEC_SIZE)
+        dsq = CONVERT(data * data, VEC_DATA_TYPE(float, VEC_SIZE));
+        sum_sq += dsq;
+#else  /* MEANSTDNORM_HALF */
+        sum_sq += data * data;
+#endif /* MEANSTDNORM_HALF */
+    }
+    // Perform reduction
+    sum    = SUM_REDUCE(sum, VEC_SIZE);
+    sum_sq = SUM_REDUCE(sum_sq, VEC_SIZE);
+
+#if VEC_SIZE > 1
+#define sum sum.s0
+#define sum_sq sum_sq.s0
+#endif // VEC_SIZE > 1
+
+    // Left-overs loop
+    for(; i < WIDTH; ++i)
+    {
+        DATA_TYPE data = *((__global DATA_TYPE *)offset(&in, i, 0));
+
+        sum += data;
+        sum_sq += data * data;
+    }
+
+    DATA_TYPE mean       = sum / WIDTH;
+    DATA_TYPE var        = (sum_sq / WIDTH) - (mean * mean);
+    DATA_TYPE stddev_inv = 1.f / sqrt(var + EPSILON);
+
+    i = 0;
+    for(; i <= (WIDTH - VEC_SIZE); i += VEC_SIZE)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+        data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)offset(&in, i, 0));
+
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+        res = (data - mean) * stddev_inv;
+        VSTORE(VEC_SIZE)
+        (res, 0, (__global DATA_TYPE *)offset(&out, i, 0));
+    }
+    for(; i < WIDTH; ++i)
+    {
+        DATA_TYPE data = *((__global DATA_TYPE *)offset(&in, i, 0));
+
+        *((__global DATA_TYPE *)offset(&out, i, 0)) = (data - mean) * stddev_inv;
+    }
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(WIDTH) */
diff --git a/src/core/CL/cl_kernels/common/memset.cl b/src/core/CL/cl_kernels/common/memset.cl
new file mode 100644
index 0000000000..9ff25f3af4
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/memset.cl
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(CONSTANT_VALUE) // Check for compile time constants
+
+/** Fill the tensor's planes with all value
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE = Tensor data type. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * -# -DCONSTANT_VALUE = The value use to fill the tensor's planes
+ * -# -DVEC_SIZE = Vector size
+ * -# -DLAST_ACCESSED_X = The element that is on the X border (threads trying to set this, might need to step back a bit)
+ *
+ * @param[in] tensor_ptr                           Pointer to the source image. Data types supported: All.
+ * @param[in] tensor_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] tensor_step_x                        tensor_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] tensor_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] tensor_step_y                        tensor_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] tensor_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] value                                The value used to fill the pages of the tensor
+ */
+__kernel void memset(
+    TENSOR3D_DECLARATION(tensor))
+{
+    Tensor3D tensor = CONVERT_TO_TENSOR3D_STRUCT(tensor);
+
+#if defined(VEC_SIZE)
+
+#if defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    tensor.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * tensor_stride_x;
+#endif // defined(LAST_ACCESSED_X)
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data = (DATA_TYPE)(CONSTANT_VALUE);
+
+    VSTORE(VEC_SIZE)
+    (data, 0, (__global DATA_TYPE *)tensor.ptr);
+#else  // !defined(VEC_SIZE)
+    *((__global DATA_TYPE *)(tensor.ptr)) = (DATA_TYPE)(CONSTANT_VALUE);
+#endif // defined(VEC_SIZE)
+}
+
+#endif // Check for compile time constants
diff --git a/src/core/CL/cl_kernels/common/minmax_layer.cl b/src/core/CL/cl_kernels/common/minmax_layer.cl
new file mode 100644
index 0000000000..49356451df
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/minmax_layer.cl
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(WIDTH) && defined(HEIGHT) && defined(DEPTH)
+/** This function identifies the min and maximum value of an input 3D tensor.
+ *
+ * @note The width, height and depth of the input tensor must be provided at compile time using -DWIDTH, -DHEIGHT and -DDEPTH (e.g. -DWIDTH=320, -DHEIGHT=240, -DDEPTH=3)
+ *
+ * @param[in] src_ptr                           Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z                      Stride of the source image in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] dst_ptr                           Pointer to the min/max vector. Minimum value in position 0, maximum value in position 1. Supported data types: F32.
+ * @param[in] dst_stride_x                      Stride of the min/max vector in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the min/max vector
+ */
+__kernel void minmax_layer(
+    TENSOR3D_DECLARATION(src),
+    VECTOR_DECLARATION(dst))
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Vector   dst = CONVERT_TO_VECTOR_STRUCT(dst);
+
+    float4 min_value     = (float4)FLT_MAX;
+    float4 max_value     = (float4) - FLT_MAX;
+    float2 min_max_value = (float2)(FLT_MAX, -FLT_MAX);
+
+    for(int z = 0; z < DEPTH; ++z)
+    {
+        for(int y = 0; y < HEIGHT; ++y)
+        {
+            int             x        = 0;
+            __global float *src_addr = (__global float *)(src.ptr + y * src_stride_y + z * src_stride_z);
+
+            for(; x <= (int)(WIDTH - 8); x += 8)
+            {
+                float8 value = *(src_addr + x);
+
+                min_value = select(value.s0123, min_value, min_value < value.s0123);
+                min_value = select(value.s4567, min_value, min_value < value.s4567);
+
+                max_value = select(value.s0123, max_value, max_value > value.s0123);
+                max_value = select(value.s4567, max_value, max_value > value.s4567);
+            }
+
+            for(; x < WIDTH; ++x)
+            {
+                float value = *(src_addr + x);
+
+                min_max_value.s0 = min(min_max_value.s0, value);
+                min_max_value.s1 = max(min_max_value.s1, value);
+            }
+        }
+    }
+
+    // Perform min/max reduction
+    min_value.s01 = min(min_value.s01, min_value.s23);
+    min_value.s0  = min(min_value.s0, min_value.s1);
+    max_value.s01 = max(max_value.s01, max_value.s23);
+    max_value.s0  = max(max_value.s0, max_value.s1);
+
+    min_max_value.s0 = min(min_max_value.s0, min_value.s0);
+    min_max_value.s1 = max(min_max_value.s1, max_value.s0);
+
+    if(min_max_value.s0 == min_max_value.s1)
+    {
+        min_max_value.s0 = 0.0f;
+        min_max_value.s1 = 1.0f;
+    }
+
+    // Store min and max
+    vstore2(min_max_value, 0, (__global float *)dst.ptr);
+}
+#endif // defined(WIDTH) && defined(HEIGHT) && defined(DEPTH)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/nonmax.cl b/src/core/CL/cl_kernels/common/nonmax.cl
new file mode 100644
index 0000000000..702e635a89
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/nonmax.cl
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function performs Non maxima suppression over a 3x3 window on a given image.
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: U8/F32
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p scr_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void non_max_suppression(
+    IMAGE_DECLARATION(src),
+    IMAGE_DECLARATION(dst))
+{
+    Image src = CONVERT_TO_IMAGE_STRUCT(src);
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    vc = vload8(0, (__global DATA_TYPE *)src.ptr);
+
+    if(all(vc == (DATA_TYPE)0))
+    {
+        vstore8(0, 0, (__global DATA_TYPE *)dst.ptr);
+
+        return;
+    }
+
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    nc = vload16(0, (__global DATA_TYPE *)offset(&src, -1, -1));
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out = select((DATA_TYPE)0, vc, (vc >= nc.s01234567) && (vc >= nc.s12345678) && (vc >= nc.s23456789));
+
+    nc  = vload16(0, (__global DATA_TYPE *)offset(&src, -1, 0));
+    out = select((DATA_TYPE)0, out, (vc >= nc.s01234567) && (vc > nc.s23456789));
+
+    nc  = vload16(0, (__global DATA_TYPE *)offset(&src, -1, +1));
+    out = select((DATA_TYPE)0, out, (vc > nc.s01234567) && (vc > nc.s12345678) && (vc > nc.s23456789));
+
+    vstore8(out, 0, (__global DATA_TYPE *)dst.ptr);
+}
diff --git a/src/core/CL/cl_kernels/common/pad_layer.cl b/src/core/CL/cl_kernels/common/pad_layer.cl
new file mode 100644
index 0000000000..5ae4ec884d
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/pad_layer.cl
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && defined(SRC_WIDTH) && defined(PAD_X_BEFORE_REMAINDER) && defined(VEC_SIZE_LEFTOVER_WRITE)
+
+#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_SELECT SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define OFFSETS VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VEC_SIZE)
+#define SCALAR_COND(x) CONVERT((VEC_SELECT)x == (VEC_SELECT)1, VEC_SELECT)
+
+#if defined(CONST_VAL) && defined(VEC_SIZE_LEFTOVER_READ)
+/** Perform a pad operation when PaddingMode is CONSTANT
+ *
+ * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @note Vector size must be passed using the -DVEC_SIZE compile flag, e.g. -DVEC_SIZE=4
+ * @note Constant value used to fill the pads must be passed using the -DCONST_VAL compile flag, e.g. -DCONST_VAL=1.27
+ * @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g. -DPAD_X_BEFORE=5
+ * @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g. -DSRC_WIDTH=224
+ * @note In case pad left is more than the vector size, the number of threads to skip along the X axis must be passed using the
+ *       -DTHREADS_TO_SKIP_BEFORE compile flag, e.g. -DTHREADS_TO_SKIP_BEFORE=1. This is defined as (PAD_X_BEFORE / VEC_SIZE)
+ * @note In case pad left is more than the vector size, the thread from which to skip along the X axis for pad right must be passed using the
+ *       -DTHREADS_TO_SKIP_AFTER compile flag, e.g. -THREADS_TO_SKIP_AFTER=1. This is defined as ((SRC_WIDTH + PAD_X_BEFORE) / VEC_SIZE)
+ * @note If pad also needs to be added to the top of the tensor, the following compile flags must be passed at compile time:
+ *       -# -DPAD_Y_BEFORE: Pad to add to the top of the input tensor (e.g. -DPAD_Y_BEFORE=3)
+ *       -# -DSRC_HEIGHT: Input tensor's height (e.g. -DSRC_HEIGHT=127)
+ * @note If pad also needs to be added to the depth of the tensor, the following compile flags must be passed at compile time:
+ *       -# -DPAD_Z_BEFORE: Pad to add before the first plane of the input tensor (e.g. -DPAD_Z_BEFORE=3)
+ *       -# -DSRC_DEPTH: Input tensor's depth (e.g. -DSRC_DEPTH=32)
+ * @note If pad also needs to be added to the batch of the tensor, the following compile flags must be passed at compile time:
+ *       -# -DPAD_W_BEFORE: Pad to add before the first batch of the input tensor (e.g. -DPAD_W_BEFORE=3)
+ *       -# -DSRC_BATCH: Input tensor's batch size (e.g. -DSRC_BATCH=4)
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: All
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source image in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination image in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  batch                             (Optional) Batch index if 4D pad must be applied
+ */
+__kernel void pad_layer_constant(TENSOR3D_DECLARATION(src),
+                                 TENSOR3D_DECLARATION(dst)
+#if defined(PAD_W_BEFORE)
+                                 ,
+                                 uint batch
+#endif // defined(PAD_W_BEFORE)
+                                )
+{
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    int z = get_global_id(2);
+
+    // If true, write only padding values; no reads performed
+    uint cond = 0;
+#if defined(THREADS_TO_SKIP_BEFORE)
+    cond |= x < THREADS_TO_SKIP_BEFORE || x > THREADS_TO_SKIP_AFTER;
+#endif // defined(THREADS_TO_SKIP_BEFORE)
+#if defined(PAD_Y_BEFORE)
+    cond |= y < PAD_Y_BEFORE || y >= (SRC_HEIGHT + PAD_Y_BEFORE);
+#endif // defined(PAD_Y_BEFORE)
+#if defined(PAD_Z_BEFORE)
+    cond |= z < PAD_Z_BEFORE || z >= (SRC_DEPTH + PAD_Z_BEFORE);
+#endif // defined(PAD_Z_BEFORE)
+#if defined(PAD_W_BEFORE)
+    cond |= batch < PAD_W_BEFORE || batch >= (SRC_BATCH + PAD_W_BEFORE);
+#endif // defined(PAD_W_BEFORE)
+
+    if(cond)
+    {
+        VEC_TYPE const_vals0 = (VEC_TYPE)CONST_VAL;
+        STORE_VECTOR_SELECT(const_vals, DATA_TYPE, dst.ptr, VEC_SIZE, VEC_SIZE_LEFTOVER_WRITE, get_global_id(0) == (get_global_size(0) - 1));
+    }
+    else
+    {
+        // Calculate input's coordinates based on output's
+        int w = 0;
+#if defined(THREADS_TO_SKIP_BEFORE)
+        x -= THREADS_TO_SKIP_BEFORE;
+#endif // defined(THREADS_TO_SKIP_BEFORE)
+#if defined(PAD_Y_BEFORE)
+        y -= PAD_Y_BEFORE;
+#endif // defined(PAD_Y_BEFORE)
+#if defined(PAD_Z_BEFORE)
+        z -= PAD_Z_BEFORE;
+#endif // defined(PAD_Z_BEFORE)
+#if defined(PAD_W_BEFORE)
+        w -= PAD_W_BEFORE * SRC_DEPTH;
+#endif // defined(PAD_W_BEFORE)
+        x *= VEC_SIZE;
+        x -= PAD_X_BEFORE_REMAINDER;
+
+        // Check for out of bound reads and clamp X coordinate
+        uint cond_left  = x < 0;
+        uint cond_right = (x + VEC_SIZE) > SRC_WIDTH;
+        x               = clamp(x, 0, (SRC_WIDTH - VEC_SIZE));
+
+        // Calculate input's address
+        __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * src_stride_x + y * src_stride_y + z * src_stride_z + w * (int)src_stride_z;
+
+        // Read values and rotate them properly if they would have been across paddings
+        VEC_TYPE src_vals0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
+        src_vals0          = select(src_vals0, ROTATE(src_vals0, VEC_SIZE, PAD_X_BEFORE_REMAINDER), SCALAR_COND(cond_left));
+        src_vals0          = select(src_vals0, ROTATE(src_vals0, VEC_SIZE, VEC_SIZE_LEFTOVER_READ), SCALAR_COND(cond_right));
+
+        // Check what values would be padding and replace them with the constant value
+        VEC_INT xs_out = (VEC_INT)(get_global_id(0) * VEC_SIZE) + VEC_OFFS(int, VEC_SIZE);
+        VEC_INT conds  = xs_out < (VEC_INT)PAD_X_BEFORE || xs_out >= (VEC_INT)(SRC_WIDTH + PAD_X_BEFORE);
+        src_vals0      = select(src_vals0, (VEC_TYPE)CONST_VAL, CONVERT(conds, VEC_SELECT));
+
+        // Store values in bounds
+        STORE_VECTOR_SELECT(src_vals, DATA_TYPE, dst.ptr, VEC_SIZE, VEC_SIZE_LEFTOVER_WRITE, get_global_id(0) == (get_global_size(0) - 1));
+    }
+}
+#endif // defined(CONST_VAL) && defined(VEC_SIZE_LEFTOVER_READ)
+
+#if defined(IS_REFLECT) && defined(PAD_X_AFTER_REMAINDER) && defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) && defined(AFTER_PAD_FACT_X)
+
+#define ROTATE_REVERSE(x, n) ROTATE(REVERSE(x, VEC_SIZE), VEC_SIZE, n)
+#define SYMM_REFL_LEFT(x, n0, n1) select(ROTATE_REVERSE(x, n1), ROTATE(x, VEC_SIZE, n0), OFFSETS >= (VEC_SELECT)n0)
+#define SYMM_REFL_RIGHT(x, n0, n1) select(ROTATE(x, VEC_SIZE, n0), ROTATE_REVERSE(x, n1), OFFSETS >= (VEC_SELECT)n0)
+
+/** Perform a pad operation when PaddingMode is SYMMETRIC
+ *
+ * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @note Vector size must be passed using the -DVEC_SIZE compile flag, e.g. -DVEC_SIZE=4
+ * @note Constant value must be passed using the -DCONST_VAL compile flag, e.g. -DCONST_VAL=1.27
+ * @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g. -DPAD_X_BEFORE=5
+ * @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g. -DSRC_WIDTH=224
+ * @note Number of values to the left when operating across left padding must be passed using the -DPAD_X_BEFORE_REMAINDER compile flag, e.g. -DPAD_X_BEFORE_REMAINDER=5
+ * @note Number of values to the left when operating across right padding must be passed using the -DPAD_X_AFTER_REMAINDER compile flag, e.g. -DPAD_X_AFTER_REMAINDER=6
+ * @note To rearrange the vectors properly, (PAD_X_BEFORE_REMAINDER + 1) must be passed when mode is REFLECT using the -DPAD_X_BEFORE_REMAINDER_REFL compile flag, e.g. -DPAD_X_BEFORE_REMAINDER=6
+ * @note To rearrange the vectors properly, (PAD_X_AFTER_REMAINDER - 1) must be passed using the -DPAD_X_AFTER_REMAINDER_REFL compile flag, e.g. -DPAD_X_AFTER_REMAINDER=5
+ * @note When after pad X, starting point to read backward from must be passed using the -DAFTER_PAD_FACT_X compile flag, e.g. -DAFTER_PAD_FACT_X=253
+ * @note If padding mode is REFLECT, the -DIS_REFLECT compile flag must be set to 1, else it must be set to 0
+ * @note If pad also needs to be added to the top of the tensor, the following compile flags must be passed at compile time:
+ *       -# -DPAD_Y_BEFORE: Pad to add to the top of the input tensor (e.g. -DPAD_Y_BEFORE=3)
+ *       -# -DSRC_HEIGHT: Input tensor's height (e.g. -DSRC_HEIGHT=127)
+ * @note If pad also needs to be added to the depth of the tensor, the following compile flags must be passed at compile time:
+ *       -# -DPAD_Z_BEFORE: Pad to add before the first plane of the input tensor (e.g. -DPAD_Z_BEFORE=3)
+ *       -# -DSRC_DEPTH: Input tensor's depth (e.g. -DSRC_DEPTH=32)
+ * @note If the starting point to read backward from is less than the output's last element accessed in the X, the following compile flags must be passed at compile time to avoid negative offsets:
+ *       -# -DAFTER_PAD_REM: Defines how much to rotate the vector if the backward calculation attempted to read from a negative offset (e.g. -DAFTER_PAD_REM=3)
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: All
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source image in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination image in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pad_layer_symmetric_reflect(TENSOR3D_DECLARATION(src),
+                                          TENSOR3D_DECLARATION(dst))
+{
+    // Get current thread position
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+
+    // Define conditions based on the thread X position w.r.t. pad left and right
+    const int x_out_first         = x * VEC_SIZE;
+    const int x_out_last          = x_out_first + VEC_SIZE;
+    const int is_before_pad_left  = (x_out_last <= PAD_X_BEFORE);
+    const int is_across_pad_left  = (x_out_first < PAD_X_BEFORE) && (x_out_last > PAD_X_BEFORE);
+    const int is_inside_input     = (x_out_first >= PAD_X_BEFORE) && (x_out_last <= (SRC_WIDTH + PAD_X_BEFORE));
+    const int is_across_pad_right = (x_out_first < (SRC_WIDTH + PAD_X_BEFORE)) && (x_out_last > (SRC_WIDTH + PAD_X_BEFORE));
+    const int is_after_pad_right  = (x_out_first >= (SRC_WIDTH + PAD_X_BEFORE));
+
+    // Calculate base pointers
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes;
+    Tensor3D        dst      = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    // Calculate input tensor's offset based on the defined conditions
+    int x_offset = 0;
+    x_offset     = select(x_offset, PAD_X_BEFORE - x_out_last + IS_REFLECT, is_before_pad_left);
+    x_offset     = select(x_offset, x_out_first - PAD_X_BEFORE, is_inside_input);
+    x_offset     = select(x_offset, SRC_WIDTH - VEC_SIZE, is_across_pad_right);
+    x_offset     = select(x_offset, AFTER_PAD_FACT_X - x_out_last, is_after_pad_right);
+
+#if defined(AFTER_PAD_REM)
+    int neg_offs = x_offset < 0;
+    x_offset     = max(x_offset, 0);
+#endif // defined(AFTER_PAD_REM)
+
+    // Load input values from the computed offset
+    int y_in = y;
+    int z_in = z;
+#if defined(PAD_Y_BEFORE)
+    y_in = select(y - PAD_Y_BEFORE, PAD_Y_BEFORE - y + IS_REFLECT - 1, y < PAD_Y_BEFORE);
+    y_in = select(y_in, 2 * SRC_HEIGHT + PAD_Y_BEFORE - y - IS_REFLECT - 1, y >= (SRC_HEIGHT + PAD_Y_BEFORE));
+#endif // defined(PAD_Y_BEFORE)
+#if defined(PAD_Z_BEFORE)
+    z_in = select(z - PAD_Z_BEFORE, PAD_Z_BEFORE - z + IS_REFLECT - 1, z < PAD_Z_BEFORE);
+    z_in = select(z_in, 2 * SRC_DEPTH + PAD_Z_BEFORE - z - IS_REFLECT - 1, z >= (SRC_DEPTH + PAD_Z_BEFORE));
+#endif // defined(PAD_Y_BEFORE)
+
+    src_addr += x_offset * src_stride_x + y_in * src_step_y + z_in * src_step_z;
+
+#if SRC_WIDTH == 1
+    VSTORE(VEC_SIZE)
+    ((VEC_TYPE)(*(__global DATA_TYPE *)src_addr), 0, (__global DATA_TYPE *)dst.ptr);
+#else // SRC_WIDTH == 1
+
+    VEC_TYPE src_vals0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
+
+    // Choose rearrangement policy based on the defined conditions
+    src_vals0 = select(src_vals0, SYMM_REFL_LEFT(src_vals0, PAD_X_BEFORE_REMAINDER, PAD_X_BEFORE_REMAINDER_REFL), SCALAR_COND(is_across_pad_left));
+    src_vals0 = select(src_vals0, SYMM_REFL_RIGHT(src_vals0, PAD_X_AFTER_REMAINDER, PAD_X_AFTER_REMAINDER_REFL), SCALAR_COND(is_across_pad_right));
+    src_vals0 = select(src_vals0, REVERSE(src_vals0, VEC_SIZE), SCALAR_COND((is_before_pad_left || is_after_pad_right)));
+#if defined(AFTER_PAD_REM)
+    src_vals0 = select(src_vals0, ROTATE(src_vals0, VEC_SIZE, AFTER_PAD_REM), SCALAR_COND(neg_offs));
+#endif // defined(AFTER_PAD_REM)
+
+    // Store values in bounds
+    STORE_VECTOR_SELECT(src_vals, DATA_TYPE, dst.ptr, VEC_SIZE, VEC_SIZE_LEFTOVER_WRITE, get_global_id(0) == (get_global_size(0) - 1));
+#endif // SRC_WIDTH == 1
+}
+#endif // defined(IS_REFLECT) && defined(PAD_X_AFTER_REMAINDER) && defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) && defined(AFTER_PAD_FACT_X)
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && defined(SRC_WIDTH) && defined(PAD_X_BEFORE_REMAINDER) && defined(VEC_SIZE_LEFTOVER_WRITE)
diff --git a/src/core/CL/cl_kernels/common/permute.cl b/src/core/CL/cl_kernels/common/permute.cl
new file mode 100644
index 0000000000..1a97ca7495
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/permute.cl
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2018-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4)
+/**Perform a permute operation on an input tensor of Shape DCHW.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
+ * @attention Permutation vector is passed as a preprocessor arguement using -DP1, -DP2, -DP3 and -DP4=int, e.g. -DP1=2, -DP2=1, -DP3=0 and -DP4=3.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: All
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void permute(TENSOR4D_DECLARATION(input),
+                      TENSOR4D_DECLARATION(output))
+
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output);
+
+    int out_index[4] = { 0 };
+    int in_index[4]  = { 0 };
+
+    in_index[0] = get_global_id(0);            // W
+    in_index[1] = get_global_id(1);            // H
+    in_index[2] = get_global_id(2) % DEPTH_IN; // C
+    in_index[3] = get_global_id(2) / DEPTH_IN; // B
+
+    out_index[0] = in_index[P1];
+    out_index[1] = in_index[P2];
+    out_index[2] = in_index[P3];
+    out_index[3] = in_index[P4];
+
+    *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2], out_index[3])) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4)
diff --git a/src/core/CL/cl_kernels/common/pixelwise_mul_float.cl b/src/core/CL/cl_kernels/common/pixelwise_mul_float.cl
new file mode 100644
index 0000000000..10875293a9
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/pixelwise_mul_float.cl
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#ifdef SATURATE
+#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x))
+#else /* SATURATE */
+#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x))
+#endif /* SATURATE */
+#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round)
+
+#if defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT)
+
+#if defined(ACTIVATION_TYPE)
+#include "activation_float_helpers.h"
+#endif // defined(ACTIVATION_TYPE)
+
+#define VEC_ACC_TYPE VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE_OUT)
+#define VEC_OUT_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE_OUT)
+
+/** Performs a pixelwise multiplication with float scale of either integer or float inputs.
+ *
+ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
+ * @attention The data type of the intermediate result of the multiplication should passed as well using -DACC_DATA_TYPE.
+ * e.g. If one of inputs is S16 -DACC_DATA_TYPE=int should be passed else -DACC_DATA_TYPE=short.
+ * @attention -DDATA_TYPE_FLOAT must be passed if floating point inputs are provided.
+ *
+ * @param[in]  in1_ptr                           Pointer to the source image. Supported data types: U8, S16, F16, F32
+ * @param[in]  in1_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_stride_z                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in1_step_z                        in1_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  in2_ptr                           Pointer to the source image. Supported data types: U8, S16, F16, F32
+ * @param[in]  in2_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_stride_z                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in2_step_z                        in2_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8, S16, F16, F32
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  scale                             Float scaling factor. Supported data types: F32
+ */
+__kernel void pixelwise_mul_float(
+    TENSOR3D_DECLARATION(in1),
+    TENSOR3D_DECLARATION(in2),
+#if !defined(IN_PLACE)
+    TENSOR3D_DECLARATION(out),
+#endif // !defined(IN_PLACE)
+    const float scale)
+{
+    // Get pixels pointer
+    size_t x = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0);
+    size_t y = get_global_id(1);
+    size_t z = get_global_id(2);
+
+    __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z;
+    __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z;
+    __global        uchar *
+#if !defined(IN_PLACE)
+    out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z;
+#else // !defined(IN_PLACE)
+#if defined(SRC1_IN_PLACE)
+    out_addr      = in1_addr;
+#else  //defined(SRC1_IN_PLACE)
+    out_addr = in2_addr;
+#endif //defined(SRC1_IN_PLACE)
+#endif // !defined(IN_PLACE)
+
+    // Load data
+    VEC_ACC_TYPE in1_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN1, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_IN1 *)in1_addr)), VEC_ACC_TYPE);
+    VEC_ACC_TYPE in2_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN2, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_IN2 *)in2_addr)), VEC_ACC_TYPE);
+
+    // Perform multiplication
+#ifdef DATA_TYPE_FLOAT
+    VEC_OUT_TYPE res0 = CONVERT(in1_data * in2_data * (ACC_DATA_TYPE)scale, VEC_OUT_TYPE);
+#else  /* DATA_TYPE_FLOAT */
+    VEC_OUT_TYPE res0 = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((CONVERT(in1_data * in2_data, VEC_FLOAT) * scale), VEC_ACC_TYPE, ROUND), VEC_OUT_TYPE, ROUND);
+#endif /* DATA_TYPE_FLOAT */
+
+#if defined(ACTIVATION_TYPE)
+    res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE_OUT, VEC_SIZE_OUT, res0, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT) */
+
+#if defined(DATA_TYPE)
+
+/** Performs a pixelwise multiplication of complex float values
+ *
+ * @param[in]  in1_ptr                           Pointer to the source image. Supported data types: F16/F32
+ * @param[in]  in1_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_stride_z                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in1_step_z                        in1_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  in2_ptr                           Pointer to the source image. Supported data types: same as @p in1_ptr
+ * @param[in]  in2_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_stride_z                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in2_step_z                        in2_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: same as @p in1_ptr
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pixelwise_mul_complex(
+    TENSOR3D_DECLARATION(in1),
+    TENSOR3D_DECLARATION(in2),
+    TENSOR3D_DECLARATION(out))
+{
+    // Get pixels pointer
+    Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
+    Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    // Load data
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    vin1 = vload2(0, (__global DATA_TYPE *)in1.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    vin2 = vload2(0, (__global DATA_TYPE *)in2.ptr);
+
+    // Perform complex multiplication
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    res = { vin1.x *vin2.x - vin1.y * vin2.y, vin1.x *vin2.y + vin2.x * vin1.y };
+
+#if defined(ACTIVATION_TYPE)
+    vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE_OUT, res, A_VAL, B_VAL), 0, (__global DATA_TYPE *)out.ptr);
+#else  // defined(ACTIVATION_TYPE)
+    // Store result
+    vstore2(res, 0, (__global DATA_TYPE *)out.ptr);
+#endif // defined(ACTIVATION_TYPE)
+}
+
+#endif // defined(DATA_TYPE)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/pixelwise_mul_int.cl b/src/core/CL/cl_kernels/common/pixelwise_mul_int.cl
new file mode 100644
index 0000000000..6d1c2d0c79
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/pixelwise_mul_int.cl
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(SATURATE)
+#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size##_sat(x))
+#else // SATURATE
+#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size(x))
+#endif // SATURATE
+#define CONVERT_OP_INT(x, type, size) CONVERT_OP_INT_STR(x, type, size)
+
+#define MUL_OP(x, y, scale, type, size) CONVERT_OP_INT((x) * (y) >> scale, type, size)
+
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+
+#if defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT)
+
+#define VEC_ACC_TYPE VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE_OUT)
+#define VEC_OUT_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)
+
+/** Performs a pixelwise multiplication with integer scale of integer inputs.
+ *
+ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
+ * @attention The data_type of the intermediate result of the multiplication should passed as well using -DACC_DATA_TYPE.
+ * e.g. If one of inputs is S16 -DACC_DATA_TYPE=int should be passed else -DACC_DATA_TYPE=short.
+ *
+ * @param[in]  in1_ptr                           Pointer to the source image. Supported data types: U8/S16
+ * @param[in]  in1_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_stride_z                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in1_step_z                        in1_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  in2_ptr                           Pointer to the source image. Supported data types: same as @p in1_ptr
+ * @param[in]  in2_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_stride_z                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in2_step_z                        in2_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: same as @p in1_ptr
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  scale                             Integer scaling factor. Supported data types: S32.
+ */
+__kernel void pixelwise_mul_int(
+    TENSOR3D_DECLARATION(in1),
+    TENSOR3D_DECLARATION(in2),
+#if !defined(IN_PLACE)
+    TENSOR3D_DECLARATION(out),
+#endif // !defined(IN_PLACE)
+    const uint scale)
+{
+    size_t x = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0);
+    size_t y = get_global_id(1);
+    size_t z = get_global_id(2);
+
+    __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z;
+    __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z;
+    __global        uchar *
+#if !defined(IN_PLACE)
+    out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z;
+#else // !defined(IN_PLACE)
+#if defined(SRC1_IN_PLACE)
+    out_addr            = in1_addr;
+#else  //defined(SRC1_IN_PLACE)
+    out_addr = in2_addr;
+#endif //defined(SRC1_IN_PLACE)
+#endif // !defined(IN_PLACE)
+
+    // Load data
+    VEC_ACC_TYPE in1_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN1, VEC_SIZE_OUT))VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_IN1 *)in1_addr), VEC_ACC_TYPE);
+    VEC_ACC_TYPE in2_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN2, VEC_SIZE_OUT))VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_IN2 *)in2_addr), VEC_ACC_TYPE);
+    // Perform multiplication and store result
+    VEC_OUT_TYPE out_data0 = MUL_OP(in1_data, in2_data, scale, DATA_TYPE_OUT, VEC_SIZE_OUT);
+    STORE_VECTOR_SELECT(out_data, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT) */
+
+#if defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE_OUT)
+
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE_OUT)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE_OUT)
+#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)
+
+/** Performs a pixelwise multiplication with float scale of quantized inputs.
+ *
+ * @note The quantization offset of the first operand must be passed at compile time only if asymmetric using -DOFFSET_IN1, e.g. -DOFFSET_IN1=10
+ * @note The quantization offset of the second operand must be passed at compile time only if asymmetric using -DOFFSET_IN2, e.g. -DOFFSET_IN2=10
+ * @note The quantization offset of the output must be passed at compile time only if asymmetric using -DOFFSET_OUT, e.g. -DOFFSET_OUT=10
+ * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, e.g. -DSCALE_IN1=10
+ * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, e.g. -DSCALE_IN2=10
+ * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, e.g. -DSCALE_OUT=10
+ * @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
+ * @attention The data type must be passed at compile time using -DDATA_TYPE_OUT, i.e. -DDATA_TYPE_OUT=uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in]  in1_ptr                           Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM16
+ * @param[in]  in1_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in1_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_stride_z                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in1_step_z                        in1_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in]  in2_ptr                           Pointer to the source image. Supported data types: same as @p in1_ptr
+ * @param[in]  in2_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in2_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_stride_z                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in2_step_z                        in2_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: same as @p in1_ptr
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  scale                             Float scaling factor. Supported data types: F32
+ */
+__kernel void pixelwise_mul_quantized(
+    TENSOR3D_DECLARATION(in1),
+    TENSOR3D_DECLARATION(in2),
+#if !defined(IN_PLACE)
+    TENSOR3D_DECLARATION(out),
+#endif // !defined(IN_PLACE)
+    const float scale)
+{
+    size_t x = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0);
+    size_t y = get_global_id(1);
+    size_t z = get_global_id(2);
+
+    __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z;
+    __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z;
+    __global        uchar *
+#if !defined(IN_PLACE)
+    out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z;
+#else // !defined(IN_PLACE)
+#if defined(SRC1_IN_PLACE)
+    out_addr            = in1_addr;
+#else  //defined(SRC1_IN_PLACE)
+    out_addr = in2_addr;
+#endif //defined(SRC1_IN_PLACE)
+#endif // !defined(IN_PLACE)
+
+    // Load data
+    VEC_INT in_a = CONVERT((VEC_TYPE)(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_OUT *)in1_addr)), VEC_INT);
+    VEC_INT in_b = CONVERT((VEC_TYPE)(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_OUT *)in2_addr)), VEC_INT);
+
+    // Dequantize
+#if defined(OFFSET_IN1)
+    in_a -= (VEC_INT)((int)OFFSET_IN1);
+#endif // defined(OFFSET_IN1)
+#if defined(OFFSET_IN2)
+    in_b -= (VEC_INT)((int)OFFSET_IN2);
+#endif // defined(OFFSET_IN2)
+    const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
+    const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
+
+#if defined(OFFSET_OUT)
+    const VEC_FLOAT qresf32 = (in1f32 * in2f32 * scale) / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFFSET_OUT));
+#else  // defined(OFFSET_OUT)
+    const VEC_FLOAT qresf32 = (in1f32 * in2f32 * scale) / ((VEC_FLOAT)(float)SCALE_OUT);
+#endif // defined(OFFSET_OUT)
+    const VEC_TYPE res0 = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_TYPE);
+
+    // Store result
+    STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif /* defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE_OUT) */
diff --git a/src/core/CL/cl_kernels/common/qlstm_layer_normalization.cl b/src/core/CL/cl_kernels/common/qlstm_layer_normalization.cl
new file mode 100644
index 0000000000..4494dd8cec
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/qlstm_layer_normalization.cl
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers_asymm.h"
+
+#if VEC_SIZE == 2
+#define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 2)
+#define PERFORM_REDUCTION_IMPL(type)                                                   \
+    inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 2) sum) \
+    {                                                                                  \
+        sum.s0 += sum.s1;                                                              \
+        return sum.s0;                                                                 \
+    }
+#elif VEC_SIZE == 4
+#define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 4)
+#define PERFORM_REDUCTION_IMPL(type)                                                   \
+    inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 4) sum) \
+    {                                                                                  \
+        sum.s01 += sum.s23;                                                            \
+        sum.s0 += sum.s1;                                                              \
+        return sum.s0;                                                                 \
+    }
+#elif VEC_SIZE == 8
+#define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 8)
+#define PERFORM_REDUCTION_IMPL(type)                                                   \
+    inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 8) sum) \
+    {                                                                                  \
+        sum.s0123 += sum.s4567;                                                        \
+        sum.s01 += sum.s23;                                                            \
+        sum.s0 += sum.s1;                                                              \
+        return sum.s0;                                                                 \
+    }
+#else /* VEC_SIZE DEFAULT */
+#define VEC_SIZE 16
+#define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 16)
+#define PERFORM_REDUCTION_IMPL(type)                                                    \
+    inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 16) sum) \
+    {                                                                                   \
+        sum.s01234567 += sum.s89abcdef;                                                 \
+        sum.s0123 += sum.s4567;                                                         \
+        sum.s01 += sum.s23;                                                             \
+        sum.s0 += sum.s1;                                                               \
+        return sum.s0;                                                                  \
+    }
+#endif /* VEC_SIZE END */
+
+#define PERFORM_REDUCTION_STR(input, type) perform_reduction_##type(input)
+#define PERFORM_REDUCTION(input, type) PERFORM_REDUCTION_STR(input, type)
+
+PERFORM_REDUCTION_IMPL(int)
+PERFORM_REDUCTION_IMPL(long)
+
+/** Compute quantized multiplier and shift for the inverse square root of input.
+ *  Using 3-bit fixed point and 5 iteration of Newton-Raphson method.
+ *
+ * @param[in] in            Input to use
+ * @param[in] reverse_shift -1 to reverse the shift direction
+ *
+ * @return:
+ *             .s0  Quantized multiplier for inverse square root
+ *             .s1  Shift for inverse square root
+ *
+ */
+inline int2 get_invsqrt_quantized_multiplier_exp(int in, int reverse_shift)
+{
+    int2 stddev_inv;
+    int  stddev_inv_multiplier = INT_MAX;
+    int  stddev_inv_shift      = 0;
+    int  input                 = in;
+    if(input <= 1)
+    {
+        stddev_inv.s0 = stddev_inv_multiplier;
+        stddev_inv.s1 = stddev_inv_shift;
+        return stddev_inv;
+    }
+
+    stddev_inv_shift = 11;
+    while(input >= (1 << 29))
+    {
+        input /= 4;
+        ++stddev_inv_shift;
+    }
+
+    const unsigned int max_left_shift_bits       = clz(input) - 1;
+    const unsigned int max_left_shift_bits_pairs = max_left_shift_bits / 2;
+    const unsigned int left_shift_bit_pairs      = max_left_shift_bits_pairs - 1;
+    stddev_inv_shift -= left_shift_bit_pairs;
+    input <<= 2 * left_shift_bit_pairs;
+
+    typedef int               FixedPointRawType;
+    const unsigned int        fixedpoint_position     = 3;
+    const unsigned int        fixedpoint_int_position = sizeof(FixedPointRawType) * 8 - 1 - fixedpoint_position;
+    typedef FixedPointRawType FixedPoint3;
+    typedef FixedPointRawType FixedPoint0;
+
+    const FixedPoint3 fixedpoint_input      = (input >> 1);
+    const FixedPoint3 fixedpoint_half_input = ASYMM_ROUNDING_DIVIDE_BY_POW2(fixedpoint_input, 1, 1);
+    const FixedPoint3 fixedpoint_half_three = (0x1 << fixedpoint_int_position) + (0x1 << (fixedpoint_int_position - 1));
+    FixedPoint3       x                     = 0x1 << fixedpoint_int_position;
+
+    const int num_iteration = 5;
+    for(int i = 0; i < num_iteration; i++)
+    {
+        int x3 = ASYMM_RESCALE(ASYMM_MULT(ASYMM_MULT(x, x, 1), x, 1), 9, fixedpoint_position, 1);
+        x      = ASYMM_RESCALE(ASYMM_MULT(fixedpoint_half_three, x, 1) - ASYMM_MULT(fixedpoint_half_input, x3, 1), 6, fixedpoint_position, 1);
+    }
+    const FixedPoint0 fixedpoint_half_sqrt_2 = 1518500250;
+    x                                        = ASYMM_MULT(fixedpoint_half_sqrt_2, x, 1);
+    stddev_inv_multiplier                    = x;
+    if(stddev_inv_shift < 0)
+    {
+        stddev_inv_multiplier <<= -stddev_inv_shift;
+        stddev_inv_shift = 0;
+    }
+    stddev_inv_shift *= reverse_shift;
+
+    stddev_inv.s0 = stddev_inv_multiplier;
+    stddev_inv.s1 = stddev_inv_shift;
+    return stddev_inv;
+}
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(WIDTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
+/** This function implements QLSTM layer normalization.
+ *
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Data type should be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Width of the input tensor should be passed using the -DWIDTH compile flag, e.g. -DWIDTH=16
+ *
+ * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: QSYMM16
+ * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[in]  weight_ptr                           Pointer to the weight tensor. Supported data type: same as @p input_ptr
+ * @param[in]  weight_stride_x                      Stride of the weight tensor in X dimension (in bytes)
+ * @param[in]  weight_step_x                        weight_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weight_offset_first_element_in_bytes The offset of the first element in the weight tensor
+ * @param[in]  bias_ptr                             Pointer to the bias tensor. Supported data type: S32
+ * @param[in]  bias_stride_x                        Stride of the bias tensor in X dimension (in bytes)
+ * @param[in]  bias_step_x                          bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes   The offset of the first element in the biases tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void qlstm_layer_normalization(
+    IMAGE_DECLARATION(input),
+    VECTOR_DECLARATION(weight),
+    VECTOR_DECLARATION(bias),
+    IMAGE_DECLARATION(output))
+{
+    // Get pixels pointer
+    Image  input  = CONVERT_TO_IMAGE_STRUCT(input);
+    Vector weight = CONVERT_TO_VECTOR_STRUCT(weight);
+    Vector bias   = CONVERT_TO_VECTOR_STRUCT(bias);
+    Image  output = CONVERT_TO_IMAGE_STRUCT(output);
+
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    sum = 0;
+    VEC_DATA_TYPE(long, VEC_SIZE)
+    sum_sq = 0;
+    // Calculate partial sum
+    int i = 0;
+    for(; i <= (WIDTH - VEC_SIZE); i += VEC_SIZE)
+    {
+        // Load data
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+        data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)offset(&input, i, 0));
+
+        sum += CONVERT(data, VEC_DATA_TYPE(int, VEC_SIZE));
+        sum_sq += CONVERT(data, VEC_DATA_TYPE(long, VEC_SIZE)) * CONVERT(data, VEC_DATA_TYPE(long, VEC_SIZE));
+    }
+    // Perform reduction
+    sum.s0    = PERFORM_REDUCTION(sum, int);
+    sum_sq.s0 = PERFORM_REDUCTION(sum_sq, long);
+
+    // Left-overs loop
+    for(; i < WIDTH; ++i)
+    {
+        DATA_TYPE data = *((__global DATA_TYPE *)offset(&input, i, 0));
+
+        sum.s0 += CONVERT(data, int);
+        sum_sq.s0 += CONVERT(data, long) * CONVERT(data, long);
+    }
+
+    int  temp       = 0x100000 / WIDTH;
+    int  mean       = (int)(sum.s0 * 1024 / WIDTH);
+    int  var2       = ((sum_sq.s0 * (long)temp) - ((long)mean * (long)mean)) / 0x100000;
+    int2 stddev_inv = get_invsqrt_quantized_multiplier_exp(var2, -1);
+
+    i = 0;
+    for(; i <= (WIDTH - VEC_SIZE); i += VEC_SIZE)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+        data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)offset(&input, i, 0));
+        VEC_DATA_TYPE(int, VEC_SIZE)
+        res = CONVERT(data, VEC_DATA_TYPE(int, VEC_SIZE)) * 1024 - mean;
+        res = multiply_by_quantized_multiplier(res, stddev_inv.s0, stddev_inv.s1);
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+        w   = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)vector_offset(&weight, i));
+        res = res * CONVERT(w, VEC_DATA_TYPE(int, VEC_SIZE));
+        res = res + VLOAD(VEC_SIZE)(0, (__global int *)vector_offset(&bias, i));
+        // Due to different rounding scheme, we might need to revisit in the future: res = select(res - 512, res + 512, res > 0) / 1024;
+        res = (res + 512) >> 10;
+        res = multiply_by_quantized_multiplier(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT + 12);
+#if defined(MIN_BOUND)
+        res = max(res, (VEC_DATA_TYPE(int, VEC_SIZE))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+        res = min(res, (VEC_DATA_TYPE(int, VEC_SIZE))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+        VSTORE(VEC_SIZE)
+        (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)offset(&output, i, 0));
+    }
+    for(; i < WIDTH; ++i)
+    {
+        DATA_TYPE data = *((__global DATA_TYPE *)offset(&input, i, 0));
+        int res        = (int)data * 1024 - mean;
+        res            = MULTIPLY_BY_QUANTIZED_MULTIPLIER(res, stddev_inv.s0, stddev_inv.s1, 1);
+        DATA_TYPE w    = *((__global DATA_TYPE *)vector_offset(&weight, i));
+        res            = res * (int)w;
+        int b          = *((__global int *)vector_offset(&bias, i));
+        res            = res + b;
+        // Due to different rounding scheme, we might need to revisit in the future: res = select(res - 512, res + 512, res > 0) / 1024;
+        res = (res + 512) >> 10;
+        res = MULTIPLY_BY_QUANTIZED_MULTIPLIER(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT + 12, 1);
+#if defined(MIN_BOUND)
+        res = max(res, MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+        res = min(res, MAX_BOUND);
+#endif // defined(MAX_BOUND)
+        *((__global DATA_TYPE *)offset(&output, i, 0)) = (DATA_TYPE)res;
+    }
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(WIDTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) */
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/quantization_layer.cl b/src/core/CL/cl_kernels/common/quantization_layer.cl
new file mode 100644
index 0000000000..69cc288c25
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/quantization_layer.cl
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_RTE_VEC_STR(x, type, size) (convert_##type##size##_rte((x)))
+#define CONVERT_RTE_VEC(x, type, size) CONVERT_RTE_VEC_STR(x, type, size)
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(SCALE) && defined(OFFSET) && defined(MIN_QUANT_VAL) && defined(MAX_QUANT_VAL)
+
+/** This performs the quantization of floating point inputs or 8-bit quantized integers to 8-bit integers.
+ *
+ * @note Input data type should be given as a preprocessor argument using -DDATA_TYPE_IN=type. e.g. -DDATA_TYPE=short
+ * @note Output data type should be given as a preprocessor argument using -DDATA_TYPE_OUT=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Quantization scale should be given as a preprocessor argument using -DSCALE=scale. e.g. -DSCALE=0.125
+ * @note Quantization offset should be given as a preprocessor argument using -DOFFSET=offset. e.g. -DOFFSET=125
+ * @note Minimum value for quantized type should be given as a preprocessor argument using -DMIN_QUANT_VAL=value. e.g. -DMIN_QUANT_VAL=0
+ * @note Maximum value for quantized type should be given as a preprocessor argument using -DMAX_QUANT_VAL=value. e.g. -DMAXIN_QUANT_VAL=255
+ * @note If the input data type if a floating point (F16 or F32) the preprocessor argument should be give as -DIS_FLOAT
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void quantization_layer(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+    output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+
+    // Load data
+#if defined(IS_FLOAT)
+    // Load data
+    VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+    val_float = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
+
+    // Create scale and offset vectors
+    const VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) vscale = SCALE;
+    const VEC_DATA_TYPE(int, VEC_SIZE) voffset         = OFFSET;
+#else  // defined(IS_FLOAT)
+    // Load data
+    VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+    val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
+
+    const VEC_DATA_TYPE(float, VEC_SIZE)
+    val_float = CONVERT(val, VEC_DATA_TYPE(float, VEC_SIZE));
+
+    // Create scale and offset vectors
+    const VEC_DATA_TYPE(float, VEC_SIZE) vscale = SCALE;
+    const VEC_DATA_TYPE(int, VEC_SIZE) voffset  = OFFSET;
+#endif // defined(IS_FLOAT)
+
+    // Quantize
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    res = CLAMP(CONVERT_RTE_VEC(val_float / vscale, int, VEC_SIZE) + voffset, MIN_QUANT_VAL, MAX_QUANT_VAL);
+
+    // Store result
+    VSTORE(VEC_SIZE)
+    (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr);
+#else  //!defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+    *((__global DATA_TYPE_OUT *)(output.ptr)) = (DATA_TYPE_OUT)CLAMP(CONVERT_RTE(((float) * (__global DATA_TYPE_IN *)input.ptr) / ((float)SCALE), int) + (int)OFFSET, MIN_QUANT_VAL, MAX_QUANT_VAL);
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(SCALE) && defined(OFFSET) && defined(MIN_QUANT_VAL) && defined(MAX_QUANT_VAL)
diff --git a/src/core/CL/cl_kernels/common/range.cl b/src/core/CL/cl_kernels/common/range.cl
new file mode 100644
index 0000000000..d25d10e207
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/range.cl
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VECTOR_SIZE) && defined(START) && defined(STEP) && defined(DATA_TYPE) && defined(VEC_SIZE_LEFTOVER)
+
+#if !defined(OFFSET_OUT) && !defined(SCALE_OUT)
+
+#if VECTOR_SIZE == 2
+#define STEP_VEC ((VEC_DATA_TYPE(DATA_TYPE, 2))(0, STEP))
+#elif VECTOR_SIZE == 3
+#define STEP_VEC ((VEC_DATA_TYPE(DATA_TYPE, 3))(0, STEP, 2 * STEP))
+#elif VECTOR_SIZE == 4
+#define STEP_VEC ((VEC_DATA_TYPE(DATA_TYPE, 4))(0, STEP, 2 * STEP, 3 * STEP))
+#elif VECTOR_SIZE == 8
+#define STEP_VEC ((VEC_DATA_TYPE(DATA_TYPE, 8))(0, STEP, 2 * STEP, 3 * STEP, 4 * STEP, 5 * STEP, 6 * STEP, 7 * STEP))
+#elif VECTOR_SIZE == 16
+#define STEP_VEC ((VEC_DATA_TYPE(DATA_TYPE, 16))(0, STEP, 2 * STEP, 3 * STEP, 4 * STEP, 5 * STEP, 6 * STEP, 7 * STEP, 8 * STEP, 9 * STEP, 10 * STEP, 11 * STEP, 12 * STEP, 13 * STEP, 14 * STEP, 15 * STEP))
+#endif // VECTOR_SIZE == 2
+
+/** Generates a sequence of numbers starting from START and extends by increments of 'STEP' up to but not including 'END'.
+ *
+ * @note starting value of the sequence must be given as a preprocessor argument using -DSTART=value. e.g. -DSTART=0
+ * @note difference between consequtive elements of the sequence must be given as a preprocessor argument using -DSTEP=value. e.g. -DSTEP=1
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note vector size supported by the device must be given as a preprocessor argument using -DVECTOR_SIZE=value. e.g. -DDATA_TYPE=4
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
+ *
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8/S8/U16/S16/U32/S32/F16/F32.
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void range(
+    VECTOR_DECLARATION(out))
+{
+    uint     id             = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VEC_SIZE_LEFTOVER) % VECTOR_SIZE), 0);
+    __global uchar *dst_ptr = out_ptr + out_offset_first_element_in_bytes + id * sizeof(DATA_TYPE);
+#if VECTOR_SIZE == 1
+    DATA_TYPE seq;
+    seq = (DATA_TYPE)START + (DATA_TYPE)id * (DATA_TYPE)STEP;
+
+    *(__global DATA_TYPE *)dst_ptr = seq;
+#else  // VECTOR_SIZE == 1
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    seq0 = ((DATA_TYPE)START + (DATA_TYPE)id * (DATA_TYPE)STEP);
+    seq0 = seq0 + STEP_VEC;
+    STORE_VECTOR_SELECT(seq, DATA_TYPE, dst_ptr, VECTOR_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+#endif //VECTOR_SIZE == 1
+}
+
+#else // !defined(OFFSET_OUT) && !defined(SCALE_OUT)
+
+#if VECTOR_SIZE == 2
+#define STEP_VEC ((VEC_DATA_TYPE(float, 2))(0, STEP))
+#elif VECTOR_SIZE == 3
+#define STEP_VEC ((VEC_DATA_TYPE(float, 3))(0, STEP, 2 * STEP))
+#elif VECTOR_SIZE == 4
+#define STEP_VEC ((VEC_DATA_TYPE(float, 4))(0, STEP, 2 * STEP, 3 * STEP))
+#elif VECTOR_SIZE == 8
+#define STEP_VEC ((VEC_DATA_TYPE(float, 8))(0, STEP, 2 * STEP, 3 * STEP, 4 * STEP, 5 * STEP, 6 * STEP, 7 * STEP))
+#elif VECTOR_SIZE == 16
+#define STEP_VEC ((VEC_DATA_TYPE(float, 16))(0, STEP, 2 * STEP, 3 * STEP, 4 * STEP, 5 * STEP, 6 * STEP, 7 * STEP, 8 * STEP, 9 * STEP, 10 * STEP, 11 * STEP, 12 * STEP, 13 * STEP, 14 * STEP, 15 * STEP))
+#endif // VECTOR_SIZE == 2
+
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+
+/** Generates a sequence of numbers starting from START and extends by increments of 'STEP' up to but not including 'END'.
+ *
+ * @note starting value of the sequence must be given as a preprocessor argument using -DSTART=value. e.g. -DSTART=0
+ * @note difference between consequtive elements of the sequence must be given as a preprocessor argument using -DSTEP=value. e.g. -DSTEP=1
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note vector size supported by the device must be given as a preprocessor argument using -DVECTOR_SIZE=vector_size. e.g. -DDATA_TYPE=4
+ * @note The quantization offset of the output must be passed at compile time using -DOFFSET_OUT, i.e. -DOFFSET_OUT=10
+ * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, i.e. -DSCALE_OUT=10
+ *
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: QASYMM8.
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void range_quantized(
+    VECTOR_DECLARATION(out))
+{
+    uint     id             = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VEC_SIZE_LEFTOVER) % VECTOR_SIZE), 0);
+    __global uchar *dst_ptr = out_ptr + out_offset_first_element_in_bytes + id * sizeof(DATA_TYPE);
+#if VECTOR_SIZE == 1
+    float           seq;
+    seq                            = (float)START + (float)id * (float)STEP;
+    seq                            = (DATA_TYPE)(int)(seq / ((float)SCALE_OUT) + (float)OFFSET_OUT);
+    seq                            = max(0.0f, min(seq, 255.0f));
+    *(__global DATA_TYPE *)dst_ptr = CONVERT_SAT(CONVERT_DOWN(seq, int), DATA_TYPE);
+#else  // VECTOR_SIZE == 1
+    VEC_DATA_TYPE(float, VECTOR_SIZE)
+    seq = (float)START + id * (float)STEP;
+    seq = seq + STEP_VEC;
+    seq = seq / ((VEC_DATA_TYPE(float, VECTOR_SIZE))((float)SCALE_OUT)) + ((VEC_DATA_TYPE(float, VECTOR_SIZE))((float)OFFSET_OUT));
+    seq = max((VEC_DATA_TYPE(float, VECTOR_SIZE))(0.0f), min(seq, (VEC_DATA_TYPE(float, VECTOR_SIZE))(255.0f)));
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    res0 = CONVERT_SAT(CONVERT_DOWN(seq, VEC_DATA_TYPE(int, VECTOR_SIZE)), VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE));
+    STORE_VECTOR_SELECT(res, DATA_TYPE, dst_ptr, VECTOR_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+#endif // VECTOR_SIZE == 1
+}
+#endif // !defined(OFFSET_OUT) && !defined(SCALE_OUT)
+
+#endif // defined(VECTOR_SIZE) && defined(START) && defined(STEP) && defined(DATA_TYPE) && defined(VEC_SIZE_LEFTOVER)
diff --git a/src/core/CL/cl_kernels/common/reduction_operation.cl b/src/core/CL/cl_kernels/common/reduction_operation.cl
new file mode 100644
index 0000000000..99369be19a
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/reduction_operation.cl
@@ -0,0 +1,471 @@
+/*
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "helpers_asymm.h"
+
+#if defined(FLOAT_DATA_TYPE)
+#define ISGREATER(x, y) (SELECT_VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE))(isgreater(x, y))
+#define ISLESS(x, y) (SELECT_VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE))(isless(x, y))
+#define ISGREATER_SCALAR(x, y) (SELECT_DATA_TYPE(DATA_TYPE_PROMOTED))(isgreater(x, y))
+#define ISLESS_SCALAR(x, y) (SELECT_DATA_TYPE(DATA_TYPE_PROMOTED))(isless(x, y))
+#else // !FLOAT_DATA_TYPE
+#if defined(WIDTH)
+#define ISGREATER(x, y) (x > y) ? 1 : 0
+#define ISLESS(x, y) (x < y) ? 1 : 0
+#define ISGREATER_SCALAR ISGREATER
+#define ISLESS_SCALAR ISLESS
+#else // !defined(WIDTH)
+#define ISGREATER(x, y) select((VEC_DATA_TYPE(int, VEC_SIZE))0, (VEC_DATA_TYPE(int, VEC_SIZE)) - 1, x > y)
+#define ISLESS(x, y) select((VEC_DATA_TYPE(int, VEC_SIZE))0, (VEC_DATA_TYPE(int, VEC_SIZE)) - 1, x < y)
+#endif // defined(WIDTH)
+#endif // defined(FLOAT_DATA_TYPE)
+
+#if defined(WIDTH)
+#if defined(OPERATION)
+
+#define sum(in0, in1, size) (in0 + SUM_REDUCE(in1, size))
+#define square_sum(in0, in1, size) (in0 + SUM_REDUCE((in1 * in1), size))
+#define product(in0, in1, size) (in0 * PROD_REDUCE(in1, size))
+#define min_(in0, in1, size) (min(in0, MIN_REDUCE(in1, size)))
+#define max_(in0, in1, size) (max(in0, MAX_REDUCE(in1, size)))
+
+/** This kernel performs parallel reduction given an operation on x-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The operation we want to perform must be passed at compile time using -DOPERATION e.g. -DOPERATION=square_sum
+ * @note The mean flag must be passed at compile time using -DMEAN if we want to compute the mean value
+ * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
+ * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128 if we want to compute the mean value
+ *
+ * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input
+ * @param[in] output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reduction_operation_x(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    int y = get_global_id(1);
+    int z = get_global_id(2);
+
+    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + y * input_stride_y + z * input_stride_z;
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + y * output_stride_y + z * output_stride_z;
+
+#if !defined(MIN) && !defined(MAX)
+#if defined(PROD)
+    DATA_TYPE res = (DATA_TYPE)1;
+#else  // defined(PROD)
+    DATA_TYPE res = (DATA_TYPE)0;
+#endif // defined(PROD)
+#else  // #if !defined(MIN) && !defined(MAX)
+    DATA_TYPE res = *((__global DATA_TYPE *)input_addr);
+#endif // #if defined(MIN) || defined(MAX)
+    int x = 0;
+
+    for(; x <= (WIDTH - VEC_SIZE); x += VEC_SIZE)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+        vals = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + x * sizeof(DATA_TYPE)));
+        res  = OPERATION(res, vals, VEC_SIZE);
+    }
+
+#if(WIDTH % VEC_SIZE)
+    _Pragma("unroll") for(; x < WIDTH; ++x)
+    {
+        DATA_TYPE val = *((__global DATA_TYPE *)(input_addr + x * sizeof(DATA_TYPE)));
+        res           = OPERATION(res, val, 1);
+    }
+#endif // (WIDTH % VEC_SIZE)
+
+#if defined(MEAN)
+    res /= WIDTH;
+#endif // defined(MEAN)
+    *((__global DATA_TYPE *)output_addr) = res;
+}
+#endif // defined(OPERATION)
+/** This kernel performs reduction on x-axis. (Non parallel)
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
+ * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
+ *
+ * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: S32/F16/F32 and QASYMM8/QASYMM8_SIGNED for operation MEAN
+ * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in] output_ptr                           The local buffer to hold sumed values. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
+ */
+__kernel void reduction_operation_non_parallel_x(
+    VECTOR_DECLARATION(input),
+    VECTOR_DECLARATION(output))
+{
+    Vector input  = CONVERT_TO_VECTOR_STRUCT(input);
+    Vector output = CONVERT_TO_VECTOR_STRUCT(output);
+
+    DATA_TYPE_PROMOTED res = CONVERT(*((__global DATA_TYPE *)vector_offset(&input, 0)), DATA_TYPE_PROMOTED);
+
+    // Convert input into F32 in order to perform quantized multiplication
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    float res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+    for(unsigned int x = 1; x < WIDTH; ++x)
+    {
+        DATA_TYPE_PROMOTED in = CONVERT(*((__global DATA_TYPE *)vector_offset(&input, x)), DATA_TYPE_PROMOTED);
+#if defined(MIN)
+        res = select(res, in, ISLESS_SCALAR(in, res));
+#elif defined(MAX)
+        res = select(res, in, ISGREATER_SCALAR(in, res));
+#elif defined(PROD)
+#if defined(OFFSET) && defined(SCALE)
+        res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
+#else  // !(defined(OFFSET) && defined(SCALE))
+        res *= in;
+#endif //  defined(OFFSET) && defined(SCALE)
+#else  // defined(SUM))
+        res += in;
+#endif // defined(MAX) || defined(MIN) || defined(PROD)
+    }
+
+    // Store result
+#if defined(MEAN)
+    res /= WIDTH;
+#endif // defined(MEAN)
+
+    // Subtract the offsets in case of quantized SUM
+#if defined(SUM) && defined(OFFSET) && defined(SCALE)
+    res -= (WIDTH - 1) * OFFSET;
+#endif // defined(OFFSET) && defined(OFFSET) && defined(SCALE)
+
+    // Re-quantize
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+    *((__global DATA_TYPE *)output.ptr) = CONVERT_SAT(res, DATA_TYPE);
+}
+#endif // defined(WIDTH)
+
+#if defined(HEIGHT)
+/** This kernel performs reduction on y-axis.
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
+ *
+ * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
+ * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in] output_ptr                           The local buffer to hold sumed values. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_z                      Stride of the output tensor in Z dimension (in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
+ */
+__kernel void reduction_operation_y(
+    __global uchar *input_ptr,
+    uint            input_stride_y,
+    uint            input_stride_z,
+    uint            input_offset_first_element_in_bytes,
+
+    __global uchar *output_ptr,
+    uint            output_stride_z,
+    uint            output_offset_first_element_in_bytes)
+{
+    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+    int z = get_global_id(1);
+
+    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + z * input_stride_z;
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + z * output_stride_z;
+
+    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
+    res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
+
+    // Convert input into F32 in order to perform quantized multiplication
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    VEC_DATA_TYPE(float, VEC_SIZE)
+    res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+#if defined(SUM_SQUARE)
+    res *= res;
+#endif // defined(SUM_SQUARE)
+
+    for(unsigned int y = 1; y < HEIGHT; ++y)
+    {
+        VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
+        in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + y * input_stride_y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
+#if defined(MIN)
+        res = select(res, in, ISLESS(in, res));
+#elif defined(MAX)
+        res = select(res, in, ISGREATER(in, res));
+#else // !(defined(MAX) || defined(MIN))
+#if defined(SUM_SQUARE)
+        in *= in;
+#endif // defined(SUM_SQUARE)
+#if defined(PROD)
+
+#if defined(OFFSET) && defined(SCALE)
+        res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
+#else  // !(defined(OFFSET) && defined(SCALE))
+        res *= in;
+#endif //  defined(OFFSET) && defined(SCALE)
+
+#else  // !defined(PROD)
+        res += in;
+#endif // defined(PROD)
+#endif // defined(MAX) || defined(MIN)
+    }
+
+#if defined(MEAN)
+    res /= HEIGHT;
+#endif // defined(MEAN)
+
+    // Subtract the offsets in case of quantized SUM
+#if defined(SUM) && defined(OFFSET) && defined(SCALE)
+    res -= (HEIGHT - 1) * OFFSET;
+#endif // defined(OFFSET) && defined(OFFSET) && defined(SCALE)
+
+    // Re-quantize
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+    // Store result
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    res0 = CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+    STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif // defined(HEIGHT)
+
+#if defined(DEPTH)
+/** This kernel performs reduction on z-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
+ * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in] output_ptr                           The local buffer to hold sumed values. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_stride_w                      Stride of the output tensor in W dimension (in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
+ */
+__kernel void reduction_operation_z(
+    __global uchar *input_ptr,
+    uint            input_stride_y,
+    uint            input_stride_z,
+    uint            input_stride_w,
+    uint            input_offset_first_element_in_bytes,
+
+    __global uchar *output_ptr,
+    uint            output_stride_y,
+    uint            output_stride_w,
+    uint            output_offset_first_element_in_bytes)
+{
+    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+    int y = get_global_id(1);
+    int w = get_global_id(2);
+
+    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + w * input_stride_w;
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + w * output_stride_w;
+
+    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
+    res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
+
+    // Convert input into F32 in order to perform quantized multiplication
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    VEC_DATA_TYPE(float, VEC_SIZE)
+    res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+#if defined(SUM_SQUARE)
+    res *= res;
+#endif // defined(SUM_SQUARE)
+
+    for(unsigned int z = 1; z < DEPTH; ++z)
+    {
+        VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
+        in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + z * input_stride_z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
+
+#if defined(MIN)
+        res = select(res, in, ISLESS(in, res));
+#elif defined(MAX)
+        res = select(res, in, ISGREATER(in, res));
+#else // !(defined(MAX) || defined(MIN))
+#if defined(SUM_SQUARE)
+        in *= in;
+#endif // defined(SUM_SQUARE)
+#if defined(PROD)
+
+#if defined(OFFSET) && defined(SCALE)
+        res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
+#else  // !(defined(OFFSET) && defined(SCALE))
+        res *= in;
+#endif //  defined(OFFSET) && defined(SCALE)
+
+#else  // !defined(PROD)
+        res += in;
+#endif // defined(PROD)
+#endif // defined(MAX) || defined(MIN)
+    }
+
+#if defined(MEAN)
+    res /= DEPTH;
+#endif // defined(MEAN)
+
+    // Subtract the offsets in case of quantized SUM
+#if defined(SUM) && defined(OFFSET) && defined(SCALE)
+    res -= (DEPTH - 1) * OFFSET;
+#endif // defined(OFFSET) && defined(OFFSET) && defined(SCALE)
+
+    // Re-quantize
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+    // Store result
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    res0 = CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+
+    STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif /* defined(DEPTH) */
+
+#if defined(BATCH) && defined(DEPTH)
+/** This kernel performs reduction on w-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
+ * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_stride_v                       Stride of the source tensor in V dimension (in bytes)
+ * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in] output_ptr                           The local buffer to hold sumed values. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_stride_z                      Stride of the output tensor in Z dimension (in bytes)
+ * @param[in] output_stride_v                      Stride of the output tensor in V dimension (in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
+ */
+__kernel void reduction_operation_w(
+    __global uchar *input_ptr,
+    uint            input_stride_y,
+    uint            input_stride_z,
+    uint            input_stride_w,
+    uint            input_stride_v,
+    uint            input_offset_first_element_in_bytes,
+
+    __global uchar *output_ptr,
+    uint            output_stride_y,
+    uint            output_stride_z,
+    uint            output_stride_v,
+    uint            output_offset_first_element_in_bytes)
+{
+    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+    int y = get_global_id(1);
+
+    int gid_2 = get_global_id(2);
+    int z = get_global_id(2) % DEPTH;
+    int v = get_global_id(2) / DEPTH;
+
+    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + z * input_stride_z + v * input_stride_v;
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + z * output_stride_z + v * output_stride_v;
+
+    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
+    res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
+
+    // Convert input into F32 in order to perform quantized multiplication
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    VEC_DATA_TYPE(float, VEC_SIZE)
+    res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+#if defined(SUM_SQUARE)
+    res *= res;
+#endif // defined(SUM_SQUARE)
+
+    for(unsigned int w = 1; w < BATCH; ++w)
+    {
+        VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
+        in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + w * input_stride_w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
+
+#if defined(MIN)
+        res = select(res, in, ISLESS(in, res));
+#elif defined(MAX)
+        res = select(res, in, ISGREATER(in, res));
+#else // !(defined(MAX) || defined(MIN))
+#if defined(SUM_SQUARE)
+        in *= in;
+#endif // defined(SUM_SQUARE)
+#if defined(PROD)
+
+#if defined(OFFSET) && defined(SCALE)
+        res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
+#else  // !(defined(OFFSET) && defined(SCALE))
+        res *= in;
+#endif //  defined(OFFSET) && defined(SCALE)
+
+#else  // !defined(PROD)
+        res += in;
+#endif //defined(PROD)
+#endif // defined(MAX) || defined(MIN)
+    }
+
+#if defined(MEAN)
+    res /= BATCH;
+#endif // defined(MEAN)
+
+    // Subtract the offsets in case of quantized SUM
+#if defined(SUM) && defined(OFFSET) && defined(SCALE)
+    res -= (BATCH - 1) * OFFSET;
+#endif // defined(OFFSET) && defined(OFFSET) && defined(SCALE)
+
+    // Re-quantize
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+    // Store result
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    res0 = CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+    STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif /* defined(BATCH) && defined(DEPTH) */
diff --git a/src/core/CL/cl_kernels/common/reshape_layer.cl b/src/core/CL/cl_kernels/common/reshape_layer.cl
new file mode 100644
index 0000000000..c47664bf85
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/reshape_layer.cl
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2017-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Perform tensor reshape
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ *
+ * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: All
+ * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  input_shape                          Input spatial shape
+ * @param[in]  output_shape                         Output spatial shape
+ */
+__kernel void reshape_layer(TENSOR3D_DECLARATION(input),
+                            TENSOR3D_DECLARATION(output),
+                            int2 input_shape,
+                            int2 output_shape)
+{
+    int out_x = get_global_id(0);
+    int out_y = get_global_id(1);
+    int out_z = get_global_id(2);
+
+    // Compute the output linearized index
+    int out_linear_idx = out_x + out_y * output_shape.x + out_z * output_shape.x * output_shape.y;
+
+    // Translate to intput
+    int in_x = out_linear_idx % input_shape.x;
+    int in_y = (out_linear_idx / input_shape.x) % input_shape.y;
+    int in_z = out_linear_idx / (input_shape.x * input_shape.y);
+
+    // Store result
+    input_ptr += input_offset_first_element_in_bytes + in_x * input_stride_x + in_y * input_stride_y + in_z * input_stride_z;
+    output_ptr += output_offset_first_element_in_bytes + out_x * output_stride_x + out_y * output_stride_y + out_z * output_stride_z;
+    *((__global DATA_TYPE *)output_ptr) = *((__global DATA_TYPE *)input_ptr);
+}
diff --git a/src/core/CL/cl_kernels/common/reverse.cl b/src/core/CL/cl_kernels/common/reverse.cl
new file mode 100644
index 0000000000..e6df3041c2
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/reverse.cl
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(NUM_REVERSE_DIMS)
+
+#if NUM_REVERSE_DIMS > 4
+#error("Reversing more than 4 dimensions is not currently supported")
+#endif /* NUM_REVERSE_DIMS > 4 */
+
+/** Performs reverse along the specified axis.
+ *
+ * @note The data type must be given as a preprocessor argument using -DDATA_TYPE=num. e.g. -DDATA_TYPE=uint
+ * @note The number of dimensions to reverse must be given as a preprocessor argument using -DNUM_REVERSE_DIMS=num, e.g. -DNUM_REVERSE_DIMS=3
+ * @note The number of dimensions of the source tensor must be given as a preprocessor argument using -DRANK=num, e.g. -DRANK=3
+ * @note The values in axis_tensor must be within [-rank, rank-1].
+ *
+ * @param[in]  src_ptr                            Pointer to the source tensor. Supported data types: All
+ * @param[in]  src_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_w                         src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[in]  axis_ptr                           Pointer to the source vector. Supported data types: U32
+ * @param[in]  axis_stride_x                      Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  axis_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  axis_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_w                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ */
+__kernel void reverse(TENSOR4D_DECLARATION(src),
+                      VECTOR_DECLARATION(axis),
+                      TENSOR4D_DECLARATION(dst),
+                      const uint width,
+                      const uint height,
+                      const uint depth,
+                      const uint batches)
+{
+    Tensor4D src  = CONVERT_TO_TENSOR4D_STRUCT(src, depth);
+    Vector   axis = CONVERT_TO_VECTOR_STRUCT_NO_STEP(axis);
+    Tensor4D dst  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(dst);
+
+    const uint x_in = get_global_id(0);
+    const uint y_in = get_global_id(1);
+    const uint z_in = get_global_id(2) % depth;
+    const uint w_in = get_global_id(2) / depth;
+
+    const uint4 dims       = (uint4)(0, 1, 2, 3);
+    int4        to_reverse = (int4)(0, 0, 0, 0);
+
+    VEC_DATA_TYPE(int, NUM_REVERSE_DIMS) indices =  VLOAD(NUM_REVERSE_DIMS)(0,(__global int *)axis.ptr);
+#if defined(USE_INVERTED_AXIS)
+    indices    = select((VEC_DATA_TYPE(int, NUM_REVERSE_DIMS)) RANK - 1, -1, indices < 0) - indices;
+#else /* defined(USE_INVERTED_AXIS) */
+    indices    = select(indices, indices + RANK, indices < 0);
+#endif /* defined(USE_INVERTED_AXIS) */
+
+#if NUM_REVERSE_DIMS == 1
+    to_reverse = ((uint4)indices == dims);
+#elif NUM_REVERSE_DIMS == 2
+    to_reverse = ((uint4)indices.s0 == dims) || ((uint4)indices.s1 == dims);
+#elif NUM_REVERSE_DIMS == 3
+    to_reverse = ((uint4)indices.s0 == dims) || ((uint4)indices.s1 == dims) || ((uint4)indices.s2 == dims);
+#else /* NUM_REVERSE_DIMS == 1 */
+    to_reverse    = ((uint4)indices.s0 == dims) || ((uint4)indices.s1 == dims) || ((uint4)indices.s2 == dims) || ((uint4)indices.s3 == dims);
+#endif /* NUM_REVERSE_DIMS == 1 */
+
+    const uint x_out = to_reverse.s0 ? width - x_in - 1 : x_in;
+    const uint y_out = to_reverse.s1 ? height - y_in - 1 : y_in;
+    const uint z_out = to_reverse.s2 ? depth - z_in - 1 : z_in;
+    const uint w_out = to_reverse.s3 ? batches - w_in - 1 : w_in;
+
+    *((__global DATA_TYPE *)tensor4D_offset(&dst, x_out, y_out, z_out, w_out)) = *((__global DATA_TYPE *)src.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(NUM_REVERSE_DIMS)
diff --git a/src/core/CL/cl_kernels/common/roi_align_layer.cl b/src/core/CL/cl_kernels/common/roi_align_layer.cl
new file mode 100644
index 0000000000..8cfe5ddcb6
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/roi_align_layer.cl
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+// This specifies the value to shift the result of roi_dims / pooled_dims before ceiling.
+// It is close to the epsilon machine (for a floating point system, x and x+EPS are the same number).
+#define EPS_GRID 0.00001f
+
+#if defined(DATA_TYPE) && defined(POOLED_DIM_X) && defined(POOLED_DIM_Y) && defined(MAX_DIM_X) && defined(MAX_DIM_Y) && defined(MAX_DIM_Z) && defined(SPATIAL_SCALE) // Check for compile time constants
+
+/** Performs a roi align on a single output pixel.
+ *
+ * @param[in] input          Pointer to input Tensor3D struct.
+ * @param[in] region_start_x Start x index projected onto the input tensor.
+ * @param[in] region_end_x   End x index projected onto the input tensor.
+ * @param[in] region_start_y Start y index projected onto the input tensor.
+ * @param[in] region_end_y   End y index projected onto the input tensor.
+ * @param[in] pz             z index of the input tensor.
+ *
+ * @return An average pooled value from the region specified in the input tensor.
+ */
+inline DATA_TYPE roi_align_1x1(const Tensor3D *input, float region_start_x,
+                               float bin_size_x,
+                               float grid_size_x,
+                               float region_end_x,
+                               float region_start_y,
+                               float bin_size_y,
+                               float grid_size_y,
+                               float region_end_y,
+                               int   pz)
+{
+    // Iterate through the pooling region
+    float sum = 0;
+    for(int iy = 0; iy < grid_size_y; ++iy)
+    {
+        for(int ix = 0; ix < grid_size_x; ++ix)
+        {
+            // Align the window in the middle of every bin
+            const float y = region_start_y + (iy + 0.5f) * bin_size_y / (float)grid_size_y;
+            const float x = region_start_x + (ix + 0.5f) * bin_size_x / (float)grid_size_x;
+
+            // Interpolation in the unit square
+            const int y_low  = (int)y;
+            const int x_low  = (int)x;
+            const int y_high = y_low + 1;
+            const int x_high = x_low + 1;
+
+            const float ly = y - y_low;
+            const float lx = x - x_low;
+            const float hy = 1.f - ly;
+            const float hx = 1.f - lx;
+
+            const float w1 = hy * hx;
+            const float w2 = hy * lx;
+            const float w3 = ly * hx;
+            const float w4 = ly * lx;
+#if defined(NHWC)
+            const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_low);
+            const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_low);
+            const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_high);
+            const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_high);
+#else  // !defined(NHWC)
+            const DATA_TYPE data1                 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_low, pz);
+            const DATA_TYPE data2                 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_low, pz);
+            const DATA_TYPE data3                 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_high, pz);
+            const DATA_TYPE data4                 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_high, pz);
+#endif // defined(NHWC)
+            sum += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
+        }
+    }
+
+    return (DATA_TYPE)(sum / (grid_size_x * grid_size_y));
+}
+
+/** Performs a roi align function.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32;
+ * @note Datasize must be passed using -DDATA_SIZE e.g. -DDATA_SIZE=32;
+ * @note Input dimensions must be passed using -DMAX_DIM_X, -DMAX_DIM_Y and -DMAX_DIM_Z;
+ * @note Pooled region dimensions must be passed using -DPOOLED_DIM_X and -DPOOLED_DIM_Y;
+ * @note Spatial scale must be passed using -DSPATIAL_SCALE;
+ * @note Sampling ratio (i.e., the number of samples in each bin) may be passed using -DSAMPLING_RATIO. If not defined each roi
+ *       will have a default sampling ratio of roi_dims/pooling_dims
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the pooled region of the source tensor as specifed by ROI
+ * @param[in]  rois_ptr                             Pointer to the ROIs tensor. Layout: { batch_index, x1, y1, x2, y2 }. Supported data types: same as @p input_ptr
+ * @param[in]  rois_stride_x                        Stride of the ROIs tensor in X dimension (in bytes)
+ * @param[in]  rois_step_x                          Step of the ROIs tensor in X dimension (in bytes)
+ * @param[in]  rois_stride_y                        Stride of the ROIs tensor in Y dimension (in bytes)
+ * @param[in]  rois_step_y                          Step of the ROIs tensor in Y dimension (in bytes)
+ * @param[in]  rois_offset_first_element_in_bytes   The offset of the first element in the ROIs tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void roi_align_layer(
+    TENSOR3D_DECLARATION(input),
+    IMAGE_DECLARATION(rois),
+    TENSOR3D_DECLARATION(output),
+    unsigned int input_stride_w, unsigned int output_stride_w)
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    Image    rois   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(rois);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+
+#if defined(NHWC)
+    const int px = get_global_id(1);
+    const int py = get_global_id(2);
+    const int pw = get_global_id(0);
+#else  // !defined(NHWC)
+    const int                                  px = get_global_id(0);
+    const int                                  py = get_global_id(1);
+    const int                                  pw = get_global_id(2);
+#endif // defined(NHWC)
+
+    // Load roi parameters
+    // roi is laid out as follows { batch_index, x1, y1, x2, y2 }
+    const ushort roi_batch = (ushort) * ((__global DATA_TYPE *)offset(&rois, 0, pw));
+    const VEC_DATA_TYPE(DATA_TYPE, 4)
+    roi                 = vload4(0, (__global DATA_TYPE *)offset(&rois, 1, pw));
+    const float2 roi_anchor = convert_float2(roi.s01) * convert_float(SPATIAL_SCALE);
+    const float2 roi_dims   = fmax(convert_float2(roi.s23 - roi.s01) * convert_float(SPATIAL_SCALE), 1.f);
+
+    // Calculate pooled region start and end
+    const float2 spatial_indx     = (float2)(px, py);
+    const float2 pooled_dims      = (float2)(POOLED_DIM_X, POOLED_DIM_Y);
+    const float2 max_spatial_dims = (float2)(MAX_DIM_X, MAX_DIM_Y);
+
+    const float2 bin_size     = (float2)((roi_dims.s0 / (float)POOLED_DIM_X), (roi_dims.s1 / (float)POOLED_DIM_Y));
+    float2       region_start = spatial_indx * bin_size + roi_anchor;
+    float2       region_end   = (spatial_indx + 1) * bin_size + roi_anchor;
+
+    region_start = clamp(region_start, 0, max_spatial_dims);
+    region_end   = clamp(region_end, 0, max_spatial_dims);
+
+#if defined(SAMPLING_RATIO)
+    const float2 roi_bin_grid = SAMPLING_RATIO;
+#else  // !defined(SAMPLING_RATIO)
+    // Note that we subtract EPS_GRID before ceiling. This is to avoid situations where 1.000001 gets ceiled to 2.
+    const float2 roi_bin_grid           = ceil(bin_size - EPS_GRID);
+#endif // defined(SAMPLING_RATIO)
+
+    // Move input and output pointer across the fourth dimension
+    input.ptr += roi_batch * input_stride_w;
+    output.ptr += pw * output_stride_w;
+    for(int pz = 0; pz < MAX_DIM_Z; ++pz)
+    {
+#if defined(NHWC)
+        __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, pz, px, py);
+#else  // !defined(NHWC)
+        __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz);
+#endif // defined(NHWC)
+        *_output_ptr = (__global DATA_TYPE)roi_align_1x1(&input,
+                                                         region_start.x,
+                                                         bin_size.x,
+                                                         roi_bin_grid.x,
+                                                         region_end.x,
+                                                         region_start.y,
+                                                         bin_size.y,
+                                                         roi_bin_grid.y,
+                                                         region_end.y, pz);
+    }
+}
+#endif // Check for compile time constants
diff --git a/src/core/CL/cl_kernels/common/roi_align_layer_quantized.cl b/src/core/CL/cl_kernels/common/roi_align_layer_quantized.cl
new file mode 100644
index 0000000000..e75dee06f6
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/roi_align_layer_quantized.cl
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers_asymm.h"
+
+// This specifies the value to shift the result of roi_dims / pooled_dims before ceiling.
+// It is close to the epsilon machine (for a floating point system, x and x+EPS are the same number).
+#define EPS_GRID 0.00001f
+
+#if defined(DATA_TYPE) && defined(POOLED_DIM_X) && defined(POOLED_DIM_Y) && defined(MAX_DIM_X) && defined(MAX_DIM_Y) && defined(MAX_DIM_Z) && defined(SPATIAL_SCALE) && defined(OFFSET_IN) && defined(OFFSET_OUT) && defined(SCALE_IN) && defined(SCALE_OUT) && defined(OFFSET_ROIS) && defined(SCALE_ROIS) // Check for compile time constants
+
+/** Performs a roi align on a single output pixel.
+ *
+ * @param[in] input          Pointer to input Tensor3D struct.
+ * @param[in] region_start_x Start x index projected onto the input tensor.
+ * @param[in] region_end_x   End x index projected onto the input tensor.
+ * @param[in] region_start_y Start y index projected onto the input tensor.
+ * @param[in] region_end_y   End y index projected onto the input tensor.
+ * @param[in] pz             z index of the input tensor.
+ *
+ * @return An average pooled value from the region specified in the input tensor.
+ */
+inline DATA_TYPE roi_align_1x1(const Tensor3D *input, float region_start_x,
+                               float bin_size_x,
+                               float grid_size_x,
+                               float region_end_x,
+                               float region_start_y,
+                               float bin_size_y,
+                               float grid_size_y,
+                               float region_end_y,
+                               int   pz)
+{
+    // Iterate through the pooling region
+    float sum = 0;
+    for(int iy = 0; iy < grid_size_y; ++iy)
+    {
+        for(int ix = 0; ix < grid_size_x; ++ix)
+        {
+            // Align the window in the middle of every bin
+            const float y = region_start_y + (iy + 0.5f) * bin_size_y / (float)grid_size_y;
+            const float x = region_start_x + (ix + 0.5f) * bin_size_x / (float)grid_size_x;
+
+            // Interpolation in the unit square
+            const int y_low  = (int)y;
+            const int x_low  = (int)x;
+            const int y_high = y_low + 1;
+            const int x_high = x_low + 1;
+
+            const float ly = y - y_low;
+            const float lx = x - x_low;
+            const float hy = 1.f - ly;
+            const float hx = 1.f - lx;
+
+            const float w1 = hy * hx;
+            const float w2 = hy * lx;
+            const float w3 = ly * hx;
+            const float w4 = ly * lx;
+#if defined(NHWC)
+            const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_low);
+            const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_low);
+            const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_high);
+            const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_high);
+#else  // !defined(NHWC)
+            const DATA_TYPE data1                 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_low, pz);
+            const DATA_TYPE data2                 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_low, pz);
+            const DATA_TYPE data3                 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_high, pz);
+            const DATA_TYPE data4                 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_high, pz);
+#endif // defined(NHWC)
+
+            const float data1_f32 = DEQUANTIZE(data1, OFFSET_IN, SCALE_IN, DATA_TYPE, 1);
+            const float data2_f32 = DEQUANTIZE(data2, OFFSET_IN, SCALE_IN, DATA_TYPE, 1);
+            const float data3_f32 = DEQUANTIZE(data3, OFFSET_IN, SCALE_IN, DATA_TYPE, 1);
+            const float data4_f32 = DEQUANTIZE(data4, OFFSET_IN, SCALE_IN, DATA_TYPE, 1);
+            sum += w1 * data1_f32 + w2 * data2_f32 + w3 * data3_f32 + w4 * data4_f32;
+        }
+    }
+
+    const float res_f32 = sum / (grid_size_x * grid_size_y);
+    return QUANTIZE(res_f32, OFFSET_OUT, SCALE_OUT, DATA_TYPE, 1);
+}
+
+/** Performs a roi align function.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=uchar
+ * @note Datasize must be passed using -DDATA_SIZE e.g. -DDATA_SIZE=32;
+ * @note Input dimensions must be passed using -DMAX_DIM_X, -DMAX_DIM_Y and -DMAX_DIM_Z;
+ * @note Pooled region dimensions must be passed using -DPOOLED_DIM_X and -DPOOLED_DIM_Y;
+ * @note Spatial scale must be passed using -DSPATIAL_SCALE;
+ * @note Sampling ratio (i.e., the number of samples in each bin) may be passed using -DSAMPLING_RATIO. If not defined each roi
+ *       will have a default sampling ratio of roi_dims/pooling_dims
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the pooled region of the source tensor as specifed by ROI
+ * @param[in]  rois_ptr                             Pointer to the ROIs tensor. Layout: { batch_index, x1, y1, x2, y2 }.
+ *                                                  Supported data types: QASYMM16 with 0.125f scale and 0 offset
+ * @param[in]  rois_stride_x                        Stride of the ROIs tensor in X dimension (in bytes)
+ * @param[in]  rois_step_x                          Step of the ROIs tensor in X dimension (in bytes)
+ * @param[in]  rois_stride_y                        Stride of the ROIs tensor in Y dimension (in bytes)
+ * @param[in]  rois_step_y                          Step of the ROIs tensor in Y dimension (in bytes)
+ * @param[in]  rois_offset_first_element_in_bytes   The offset of the first element in the ROIs tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void roi_align_layer_quantized(
+    TENSOR3D_DECLARATION(input),
+    IMAGE_DECLARATION(rois),
+    TENSOR3D_DECLARATION(output),
+    unsigned int input_stride_w, unsigned int output_stride_w)
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    Image    rois   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(rois);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+
+#if defined(NHWC)
+    const int px = get_global_id(1);
+    const int py = get_global_id(2);
+    const int pw = get_global_id(0);
+#else  // !defined(NHWC)
+    const int                                  px = get_global_id(0);
+    const int                                  py = get_global_id(1);
+    const int                                  pw = get_global_id(2);
+#endif // defined(NHWC)
+
+    // Load roi parameters
+    // roi is laid out as follows { batch_index, x1, y1, x2, y2 }
+    const ushort roi_batch = *((__global ushort *)offset(&rois, 0, pw));
+    float4 roi             = DEQUANTIZE(vload4(0, (__global ushort *)offset(&rois, 1, pw)), OFFSET_ROIS, SCALE_ROIS, ushort, 4);
+    float2 roi_anchor      = roi.s01 * convert_float(SPATIAL_SCALE);
+    float2 roi_dims        = fmax((roi.s23 - roi.s01) * convert_float(SPATIAL_SCALE), 1.f);
+
+    // Calculate pooled region start and end
+    float2 spatial_indx     = (float2)(px, py);
+    float2 pooled_dims      = (float2)(POOLED_DIM_X, POOLED_DIM_Y);
+    float2 max_spatial_dims = (float2)(MAX_DIM_X, MAX_DIM_Y);
+
+    float2 bin_size     = (float2)((roi_dims.s0 / (float)POOLED_DIM_X), (roi_dims.s1 / (float)POOLED_DIM_Y));
+    float2 region_start = spatial_indx * bin_size + roi_anchor;
+    float2 region_end   = (spatial_indx + 1) * bin_size + roi_anchor;
+
+    region_start = clamp(region_start, 0, max_spatial_dims);
+    region_end   = clamp(region_end, 0, max_spatial_dims);
+
+#if defined(SAMPLING_RATIO)
+    float2 roi_bin_grid = SAMPLING_RATIO;
+#else  // !defined(SAMPLING_RATIO)
+    // Note that we subtract EPS_GRID before ceiling. This is to avoid situations where 1.000001 gets ceiled to 2.
+    float2       roi_bin_grid           = ceil(bin_size - EPS_GRID);
+#endif // defined(SAMPLING_RATIO)
+
+    // Move input and output pointer across the fourth dimension
+    input.ptr += roi_batch * input_stride_w;
+    output.ptr += pw * output_stride_w;
+    for(int pz = 0; pz < MAX_DIM_Z; ++pz)
+    {
+#if defined(NHWC)
+        __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, pz, px, py);
+#else  // !defined(NHWC)
+        __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz);
+#endif // defined(NHWC)
+        *_output_ptr = (__global DATA_TYPE)roi_align_1x1(&input,
+                                                         region_start.x,
+                                                         bin_size.x,
+                                                         roi_bin_grid.x,
+                                                         region_end.x,
+                                                         region_start.y,
+                                                         bin_size.y,
+                                                         roi_bin_grid.y,
+                                                         region_end.y, pz);
+    }
+}
+#endif // Check for compile time constants
diff --git a/src/core/CL/cl_kernels/common/roi_pooling_layer.cl b/src/core/CL/cl_kernels/common/roi_pooling_layer.cl
new file mode 100644
index 0000000000..6899b952e0
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/roi_pooling_layer.cl
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "helpers_asymm.h"
+
+#if DATA_SIZE == 32
+#define VEC_SIZE 4
+#define VEC_MAX vec4_max
+#elif DATA_SIZE == 16
+#define VEC_SIZE 8
+#define VEC_MAX vec8_max
+#elif DATA_SIZE == 8
+#define VEC_SIZE 16
+#define VEC_MAX vec16_max
+#else /* DATA_SIZE not equals 8, 16, 32 */
+#error "Unsupported data size"
+#endif /* DATA_SIZE == 32 */
+
+// Define whether to use max (Quantized datatype) or fmax (Float) functions
+#if defined(OFFSET_OUT) && defined(SCALE_OUT)
+#define MAX(x, y) max(x, y)
+#else // !(defined(OFFSET_OUT) && defined(SCALE_OUT)
+#define MAX(x, y) fmax(x, y)
+#endif // defined(OFFSET_OUT) && defined(SCALE_OUT)
+
+inline DATA_TYPE vec4_max(VEC_DATA_TYPE(DATA_TYPE, 4) vec)
+{
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    temp = MAX(vec.lo, vec.hi);
+    return MAX(temp.x, temp.y);
+}
+
+inline DATA_TYPE vec8_max(VEC_DATA_TYPE(DATA_TYPE, 8) vec)
+{
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    temp = MAX(vec.lo, vec.hi);
+    return vec4_max(temp);
+}
+
+inline DATA_TYPE vec16_max(VEC_DATA_TYPE(DATA_TYPE, 16) vec)
+{
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    temp = MAX(vec.lo, vec.hi);
+    return vec8_max(temp);
+}
+
+/** Performs a roi pooling on a single output pixel.
+ *
+ * @param[in] input          Pointer to input Tensor3D struct.
+ * @param[in] region_start_x Start x index projected onto the input tensor.
+ * @param[in] region_end_x   End x index projected onto the input tensor.
+ * @param[in] region_start_y Start y index projected onto the input tensor.
+ * @param[in] region_end_y   End y index projected onto the input tensor.
+ * @param[in] pz             z index of the input tensor.
+ *
+ * @return A max pooled value from the region specified in the input tensor.
+ */
+inline DATA_TYPE roi_pool_1x1(const Tensor3D *input, int region_start_x, int region_end_x, int region_start_y, int region_end_y, int pz)
+{
+    // Iterate through the pooling region
+    if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
+    {
+        return (DATA_TYPE)0;
+    }
+    else
+    {
+        int num_iter = (int)((region_end_x - region_start_x) / VEC_SIZE);
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+        curr_max = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(MIN_VALUE);
+
+        for(int j = region_start_y; j < region_end_y; ++j)
+        {
+            int i = region_start_x;
+            for(; i < region_start_x + num_iter * VEC_SIZE; i += VEC_SIZE)
+            {
+                VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+                val      = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(input, i, j, pz));
+                curr_max = MAX(val, curr_max);
+            }
+            for(; i < region_end_x; ++i)
+            {
+                DATA_TYPE val = *(__global DATA_TYPE *)tensor3D_offset(input, i, j, pz);
+                curr_max      = MAX(curr_max, val);
+            }
+        }
+
+        const DATA_TYPE temp = (DATA_TYPE)VEC_MAX(curr_max);
+
+#if defined(OFFSET_OUT) && defined(SCALE_OUT)
+        return QUANTIZE(temp, OFFSET_OUT, SCALE_OUT, DATA_TYPE, 1);
+#endif /* if quantized, requantize and return */
+
+        return temp;
+    }
+}
+
+/** Performs a roi pooling function.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32, QASYMM8;
+ * @note Datasize must be passed using -DDATA_SIZE e.g. -DDATA_SIZE=32;
+ * @note Input dimensions must be passed using -DMAX_DIM_X, -DMAX_DIM_Y and -DMAX_DIM_Z;
+ * @note Pooled region dimensions must be passed using -DPOOLED_DIM_X and -DPOOLED_DIM_Y;
+ * @note Spatial scale must be passed using -DSPATIAL_SCALE;
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16, F32, QASYMM8
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the pooled region of the source image as specifed by ROI
+ * @param[in]  rois_ptr                             Pointer to the ROIs tensor. Layout: { batch_index, x1, y1, x2, y2 }. Supported data types: same as @p input_ptr
+ * @param[in]  rois_stride_x                        Stride of the ROIs tensor in X dimension (in bytes)
+ * @param[in]  rois_step_x                          Step of the ROIs tensor in X dimension (in bytes)
+ * @param[in]  rois_stride_y                        Stride of the ROIs tensor in Y dimension (in bytes)
+ * @param[in]  rois_step_y                          Step of the ROIs tensor in Y dimension (in bytes)
+ * @param[in]  rois_offset_first_element_in_bytes   The offset of the first element in the ROIs tensor
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as input
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  input_stride_w                       Stride of the source image in W dimension (in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination image in W dimension (in bytes)
+ */
+__kernel void roi_pooling_layer(
+    TENSOR3D_DECLARATION(input),
+    IMAGE_DECLARATION(rois),
+    TENSOR3D_DECLARATION(output),
+    unsigned int input_stride_w, unsigned int output_stride_w)
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    Image    rois   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(rois);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+
+    const int px = get_global_id(0);
+    const int py = get_global_id(1);
+    const int pw = get_global_id(2);
+
+    // Load roi parameters
+    // roi is laid out as follows { batch_index, x1, y1, x2, y2 }
+    const ushort roi_batch = (ushort) * ((__global ushort *)offset(&rois, 0, pw));
+    const VEC_DATA_TYPE(ushort, 4)
+    roi               = vload4(0, (__global ushort *)offset(&rois, 1, pw));
+    const int2 roi_anchor = convert_int2_sat(round(convert_float2(roi.s01) * (float)SPATIAL_SCALE));
+    const int2 roi_dims   = convert_int2_sat(fmax(round(convert_float2(roi.s23 - roi.s01) * (float)SPATIAL_SCALE), 1.f));
+
+    // Calculate pooled region start and end
+    const float2 spatial_indx     = (float2)(px, py);
+    const float2 pooled_dims      = (float2)(POOLED_DIM_X, POOLED_DIM_Y);
+    const int2   max_spatial_dims = (int2)(MAX_DIM_X, MAX_DIM_Y);
+    int2         region_start     = convert_int2_sat(floor(spatial_indx / pooled_dims * convert_float2(roi_dims))) + roi_anchor;
+    int2         region_end       = convert_int2_sat(floor((spatial_indx + 1) / pooled_dims * convert_float2(roi_dims))) + roi_anchor;
+
+    region_start = clamp(region_start, 0, max_spatial_dims);
+    region_end   = clamp(region_end, 0, max_spatial_dims);
+
+    // Move input and output pointer across the fourth dimension
+    input.ptr += roi_batch * input_stride_w;
+    output.ptr += pw * output_stride_w;
+
+    for(int pz = 0; pz < MAX_DIM_Z; ++pz)
+    {
+        *(__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz) = (__global DATA_TYPE)roi_pool_1x1(&input,
+                                                                                                       region_start.x,
+                                                                                                       region_end.x,
+                                                                                                       region_start.y,
+                                                                                                       region_end.y, pz);
+    }
+}
diff --git a/src/core/CL/cl_kernels/common/scatter.cl b/src/core/CL/cl_kernels/common/scatter.cl
new file mode 100644
index 0000000000..e3ec9cc98e
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/scatter.cl
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+// The below defines the various reduce operations for our purposes.
+// Where a corresponds to the existing value, and b the new value.
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+
+#ifdef IS_FLOAT
+#define MAX_OP(a, b) fmax(a, b)
+#define MIN_OP(a, b) fmin(a, b)
+#else // ifdef IS_FLOAT
+#define MAX_OP(a, b) max(a, b)
+#define MIN_OP(a, b) min(a, b)
+#endif // ifdef IS_FLOAT
+
+#define UPDATE_OP(a, b) (b)
+
+#ifdef SCATTER_MP1D_2D_MPND
+
+/** This kernel performs scatter operation
+ *
+ * @note Datatype should be given as a compile-time argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Number of indices should be given as a compile-time argument using -DNUM_INDICES, e.g. -DNUM_INDICES=3
+ * @note Index length should be given as a compile-time argument using -DINDEX_LENGTH, e.g. -DINDEX_LENGTH=2
+ * @note Outermost output shapes should be given as a compile-time argument using -DOUT_SHAPE_N_MINUS_X, where
+ *       X must be 1,2,3,4,5, e.g. -DOUT_SHAPE_N_MINUS_1=3, ...
+ * @note Number of elements to copy in a row should be given as a compile-time argument using -DN0, e.g. -DN0=4
+ * @note Number of partial elements at the edge to copy in a row should be given as a compile-time argument using
+ *       -DPARTIAL_N0, e.g. -DPARTIAL_N0=2
+ * @note Scatter function should be given as a compile-time argument using -DSCATTER_FUNCTION, e.g. -DSCATTER_FUNCTION=ADD
+ * @note If the kernel should skip reading the output tensor, -DSKIP_OUTPUT_READ option should be provided.
+ * @note Kernel name in uppercase letters should be provided as a compile-time argument, e.g. -DSCATTER_MP1D_2D_MPND
+ *
+ * @param[in]  updates_ptr                           Pointer to the updates tensor. Data Types: F32
+ * @param[in]  updates_stride_x                      Stride of the updates tensor in X dimension (in bytes)
+ * @param[in]  updates_step_x                        updates_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  updates_stride_y                      Stride of the updates tensor in Y dimension (in bytes)
+ * @param[in]  updates_step_y                        updates_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  updates_offset_first_element_in_bytes The offset of the first element in the updates tensor
+ * @param[in]  indices_ptr                           Pointer to the indices tensor. Data Types: S32
+ * @param[in]  indices_stride_x                      Stride of the indices tensor in X dimension (in bytes)
+ * @param[in]  indices_step_x                        indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  indices_stride_y                      Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in]  indices_step_y                        indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
+ * @param[out] output_ptr                            Pointer to the destination tensor. Same as @p upt_ptr
+ * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  upt_block_stride                      Update tensor data block stride in bytes
+ * @param[in]  out_block_stride                      Output tensor data block stride in bytes
+ */
+__kernel void scatter_mp1d_2d_mpnd(
+    IMAGE_DECLARATION(updates),
+    IMAGE_DECLARATION(indices),
+    IMAGE_DECLARATION(output),
+    int upt_block_stride,
+    int out_block_stride
+    )
+{
+    const int out_shape[5] = {OUT_SHAPE_N_MINUS_1, OUT_SHAPE_N_MINUS_2, OUT_SHAPE_N_MINUS_3,
+        OUT_SHAPE_N_MINUS_4, OUT_SHAPE_N_MINUS_5};
+
+    const int x = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // x-coordinate in the tensor
+    const int y = get_global_id(1); // collapsed y-coordinate (ignoring the outermost dimensions)
+
+    const bool x_cond = (PARTIAL_N0 != 0 && get_global_id(0) == 0);
+
+    uchar *ind_ptr_raw = indices_ptr + indices_offset_first_element_in_bytes;
+    const uchar *out_ptr_raw =  output_ptr + output_offset_first_element_in_bytes
+        + x * sizeof(DATA_TYPE) + y * output_stride_y;
+
+    const uchar *upt_ptr_raw = updates_ptr + updates_offset_first_element_in_bytes
+        + x * sizeof(DATA_TYPE) + y * updates_stride_y;
+
+    for(int index_element = 0; index_element < NUM_INDICES; ++index_element)
+    {
+        const int *ind_ptr = (const int *) (ind_ptr_raw);
+
+        // Out of bounds check
+        bool out_of_bounds = false;
+        LOOP_UNROLLING(int, i, 0, 1, INDEX_LENGTH,
+        {
+            if(ind_ptr[i] >= out_shape[i] || ind_ptr[i] < 0)
+            {
+                out_of_bounds = true;
+            }
+        });
+
+        ind_ptr_raw += indices_stride_y;
+
+        if(out_of_bounds)
+        {
+            continue;
+        }
+
+        // Index calculation
+        int index = 0;
+        LOOP_UNROLLING(int, i, 0, 1, INDEX_LENGTH,
+        {
+            index = index * out_shape[i] + ind_ptr[i];
+        });
+
+        DATA_TYPE *out_ptr = (DATA_TYPE *) (out_ptr_raw + index * out_block_stride);
+
+        const DATA_TYPE *upt_ptr = (const DATA_TYPE *) (upt_ptr_raw + index_element * upt_block_stride);
+
+        VEC_DATA_TYPE(DATA_TYPE, N0) data_in0 = VLOAD(N0)(0, (__global DATA_TYPE *) upt_ptr);
+
+#ifdef SKIP_OUTPUT_READ
+        STORE_VECTOR_SELECT(data_in, DATA_TYPE, (__global DATA_TYPE *) out_ptr, N0, PARTIAL_N0, x_cond);
+#else // ifdef SKIP_OUTPUT_READ
+        VEC_DATA_TYPE(DATA_TYPE, N0) data_out0 = VLOAD(N0)(0, (__global DATA_TYPE *) out_ptr);
+        data_out0 = SCATTER_FUNCTION(data_out0, data_in0);
+
+        STORE_VECTOR_SELECT(data_out, DATA_TYPE, (__global DATA_TYPE *) out_ptr, N0, PARTIAL_N0, x_cond);
+#endif // ifdef SKIP_OUTPUT_READ
+    }
+}
+
+#endif // SCATTER_MP1D_2D_MPND
+
+#ifdef SCATTER1D_PARALLEL
+
+// NOTE : This code is non-deterministic and can only be excecuted with the "update" ScatterFunction
+// This code is currently unusued as it requires changes to the existing test suite.
+/** Performs the Scatter1D operation with multiple threads.
+ *  Similar to @ref scatter1D()
+ */
+__kernel void scatter1D_parallel(
+    TENSOR4D_DECLARATION(updates),
+    TENSOR4D_DECLARATION(indices),
+    TENSOR4D_DECLARATION(output))
+{
+    // Currently 1D - only iterate through x dimension of indices.
+    const int px = get_global_id(0);
+    const int index_value = *(uchar*)(indices_ptr + indices_offset_first_element_in_bytes + (sizeof(int) * px));
+
+    if(index_value < OUT_SHAPE_X)
+    {
+        const DATA_TYPE update = *(DATA_TYPE *)(updates_ptr + updates_offset_first_element_in_bytes + (sizeof(DATA_TYPE) * px));
+        __global uchar *out_addr = output_ptr + indices_offset_first_element_in_bytes + (sizeof(DATA_TYPE) * index_value);
+        *(__global DATA_TYPE *)(out_addr) = update;
+    }
+}
+
+#endif // SCATTER1D_PARALLEL
diff --git a/src/core/CL/cl_kernels/common/select.cl b/src/core/CL/cl_kernels/common/select.cl
new file mode 100644
index 0000000000..6fd4bd4ce3
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/select.cl
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
+/** This function perform a select operation between two tensors when condition tensor has the same rank.
+ *
+ * @attention The data_type need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Leftover size in the X dimension should be given as preprocessor argument using -DVEC_SIZE_LEFTOVER=value: e.g. x_dimension % VEC_SIZE
+ *
+ * @param[in]  c_ptr                             Pointer to the source tensor. Supported data types: U8
+ * @param[in]  c_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  c_step_x                          c_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  c_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  c_step_y                          c_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  c_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  c_step_z                          c_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  c_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[in]  x_ptr                             Pointer to the source tensor. Supported data types: All
+ * @param[in]  x_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  x_step_x                          x_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  x_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  x_step_y                          x_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  x_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  x_step_z                          x_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  x_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[in]  y_ptr                             Pointer to the source tensor. Supported data types: same as @p x_ptr
+ * @param[in]  y_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  y_step_x                          y_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  y_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  y_step_y                          y_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  y_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  y_step_z                          y_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  y_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: same as @p x_ptr
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void select_same_rank(
+    TENSOR3D_DECLARATION(c),
+    TENSOR3D_DECLARATION(x),
+    TENSOR3D_DECLARATION(y),
+    TENSOR3D_DECLARATION(out))
+{
+    // Get pointers
+    uint     offset          = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+    __global uchar *c_addr   = c_ptr + c_offset_first_element_in_bytes + offset + get_global_id(1) * c_step_y + get_global_id(2) * c_step_z;
+    __global uchar *x_addr   = x_ptr + x_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * x_step_y + get_global_id(2) * x_step_z;
+    __global uchar *y_addr   = y_ptr + y_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * y_step_y + get_global_id(2) * y_step_z;
+    __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z;
+
+    // Load values
+    SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    in_c = CONVERT(VLOAD(VEC_SIZE)(0, c_addr), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_addr);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    in_y = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)y_addr);
+
+    // Calculate result
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    res0 = select(in_y, in_x, CONVERT(in_c > (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
+
+    // Boundary-aware store
+    STORE_VECTOR_SELECT(res, DATA_TYPE, (__global DATA_TYPE *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+
+/** This function perform a select operation between two tensors when condition tensor has a different rank.
+ *
+ * @attention The data_type need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Leftover size in the X dimension should be given as preprocessor argument using -DVEC_SIZE_LEFTOVER=value: e.g. x_dimension % VEC_SIZE
+ *
+ * @param[in]  c_ptr                             Pointer to the source tensor. Supported data types: U8
+ * @param[in]  c_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  c_step_x                          c_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  c_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[in]  x_ptr                             Pointer to the source tensor. Supported data types: All
+ * @param[in]  x_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  x_step_x                          x_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  x_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  x_step_y                          x_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  x_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  x_step_z                          x_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  x_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[in]  y_ptr                             Pointer to the source tensor. Supported data types: same as @p x_ptr
+ * @param[in]  y_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  y_step_x                          y_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  y_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  y_step_y                          y_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  y_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  y_step_z                          y_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  y_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: same as @p x_ptr
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void select_different_rank_2(
+    VECTOR_DECLARATION(c),
+    TENSOR3D_DECLARATION(x),
+    TENSOR3D_DECLARATION(y),
+    TENSOR3D_DECLARATION(out))
+{
+    const int c_idx = get_global_id(1);
+
+    // Get pointers
+    uint     offset          = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+    __global uchar *c_addr   = c_ptr + c_offset_first_element_in_bytes;
+    __global uchar *x_addr   = x_ptr + x_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * x_step_y + get_global_id(2) * x_step_z;
+    __global uchar *y_addr   = y_ptr + y_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * y_step_y + get_global_id(2) * y_step_z;
+    __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z;
+
+    // Load values
+    SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    in_c = *((__global uchar *)(c_addr + c_idx * c_stride_x));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_addr);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    in_y = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)y_addr);
+
+    // Calculate result
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    res0 = select(in_y, in_x, CONVERT(in_c > (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
+
+    // Boundary-aware store
+    STORE_VECTOR_SELECT(res, DATA_TYPE, (__global DATA_TYPE *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) */
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_SIZE) && defined(VEC_SIZE_LEFTOVER)
+/** This function perform a select operation between two tensors when condition tensor has a different rank.
+ *
+ * @attention The data_type need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Leftover size in the X dimension should be given as preprocessor argument using -DVEC_SIZE_LEFTOVER=value: e.g. x_dimension % VEC_SIZE
+ *
+ * @param[in]  c_ptr                             Pointer to the source tensor. Supported data types: U8
+ * @param[in]  c_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  c_step_x                          c_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  c_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[in]  x_ptr                             Pointer to the source tensor. Supported data types: All
+ * @param[in]  x_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  x_step_x                          x_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  x_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  x_step_y                          x_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  x_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  x_step_z                          x_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  x_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[in]  y_ptr                             Pointer to the source tensor. Supported data types: same as @p x_ptr
+ * @param[in]  y_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  y_step_x                          y_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  y_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  y_step_y                          y_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  y_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  y_step_z                          y_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  y_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: same as @p x_ptr
+ * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void select_different_rank_n(
+    VECTOR_DECLARATION(c),
+    TENSOR3D_DECLARATION(x),
+    TENSOR3D_DECLARATION(y),
+    TENSOR3D_DECLARATION(out))
+{
+    const int c_idx = get_global_id(2) / DEPTH_SIZE;
+
+    // Get pointers
+    uint     offset          = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+    __global uchar *c_addr   = c_ptr + c_offset_first_element_in_bytes;
+    __global uchar *x_addr   = x_ptr + x_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * x_step_y + get_global_id(2) * x_step_z;
+    __global uchar *y_addr   = y_ptr + y_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * y_step_y + get_global_id(2) * y_step_z;
+    __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z;
+
+    // Load values
+    SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    in_c = *((__global uchar *)(c_addr + c_idx * c_stride_x));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_addr);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    in_y = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)y_addr);
+
+    // Calculate result
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    res0 = select(in_y, in_x, CONVERT(in_c > (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)));
+
+    // Boundary-aware store
+    STORE_VECTOR_SELECT(res, DATA_TYPE, (__global DATA_TYPE *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_SIZE) && defined(VEC_SIZE_LEFTOVER) */
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/slice_ops.cl b/src/core/CL/cl_kernels/common/slice_ops.cl
new file mode 100644
index 0000000000..189d414aba
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/slice_ops.cl
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2018-2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Perform a strided slice operation on a given input.
+ *
+ * @attention Supported tensor rank: up to 4
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Output tensor depht should be given as a preprocessor arguments using -DDST_DEPTH=size
+ * @attention Absolute start coordinates for each dimension should be given as preprocessor -DSTART_index=value e.g. -DSTART_0=2
+ * @attention Strides for each dimension should be given as preprocessor -DSTRIDE_index=value e.g. -DSTRIDE_1=1
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: All
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void strided_slice(
+    TENSOR4D_DECLARATION(input),
+    TENSOR4D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor4D input  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+    Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
+
+    int offset = 0;
+
+    // Offset X
+#if defined(SHRINK_0)
+    input.ptr += (int)START_0 * input_stride_x;
+#elif defined(START_0) && defined(STRIDE_0) && defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    offset       = (int)START_0 + min(xi, (int)LAST_ACCESSED_X);
+    input.ptr += offset * input_stride_x;
+    output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+#elif defined(START_0) && defined(STRIDE_0)
+    offset = (int)START_0 + (int)get_global_id(0) * (int)STRIDE_0;
+    input.ptr += offset * input_stride_x;
+#endif // defined(START_0) && defined(STRIDE_0)
+
+    // Offset Y
+#if defined(SHRINK_1)
+    input.ptr += (int)START_1 * input_stride_y;
+#elif defined(START_1) && defined(STRIDE_1)
+#if defined(SHRINK_0)
+    offset = (int)START_1 + (int)get_global_id(0) * (int)STRIDE_1;
+#else  // defined(SHRINK_0)
+    offset = (int)START_1 + (int)get_global_id(1) * (int)STRIDE_1;
+#endif // defined(SHRINK_0)
+    input.ptr += offset * input_stride_y;
+#endif // defined(START_1) && defined(STRIDE_1)
+
+    // Offset Z
+#if defined(SHRINK_2)
+    input.ptr += (int)START_2 * input_stride_z;
+#elif defined(START_2) && defined(STRIDE_2)
+
+#if defined(SHRINK_1) && defined(SHRINK_0)
+    offset = (int)START_2 + (int)get_global_id(0) * (int)STRIDE_2;
+#elif defined(SHRINK_1) || defined(SHRINK_0)
+    offset = (int)START_2 + (int)get_global_id(1) * (int)STRIDE_2;
+#else  // defined(SHRINK_1) && defined(SHRINK_0)
+    offset = (int)START_2 + ((int)get_global_id(2) % (int)DST_DEPTH) * (int)STRIDE_2;
+#endif // defined(SHRINK_1) && defined(SHRINK_0)
+
+    input.ptr += offset * input_stride_z;
+#endif // defined(START_2) && defined(STRIDE_2)
+
+    // Offset depth
+#if defined(SHRINK_3)
+    input.ptr += (int)START_3 * input_stride_w;
+#elif defined(START_3) && defined(STRIDE_3)
+#if defined(SHRINK_2) && defined(SHRINK_1) && defined(SHRINK_0)
+    offset = (int)START_3 + (int)get_global_id(0) * (int)STRIDE_3;
+#elif !defined(SHRINK_2) && !defined(SHRINK_1) && !defined(SHRINK_0)
+    offset = (int)START_3 + ((int)get_global_id(2) / (int)DST_DEPTH) * (int)STRIDE_3;
+#elif(defined(SHRINK_0) && defined(SHRINK_1)) || (defined(SHRINK_1) && defined(SHRINK_2)) || (defined(SHRINK_0) && defined(SHRINK_2))
+    offset = (int)START_3 + (int)get_global_id(1) * (int)STRIDE_3;
+#else  // defined(SHRINK_2) && defined(SHRINK_1) && defined(SHRINK_0)
+    offset = (int)START_3 + ((int)get_global_id(2) % (int)DST_DEPTH) * (int)STRIDE_3;
+#endif // defined(SHRINK_2) && defined(SHRINK_1) && defined(SHRINK_0)
+    input.ptr += offset * input_stride_w;
+#endif // defined(START_3) && defined(STRIDE_3)
+
+    // Store result
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input.ptr));
+
+    VSTORE(VEC_SIZE)
+    (val, 0, (__global DATA_TYPE *)(output.ptr));
+#else  // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    *((__global DATA_TYPE *)(output.ptr)) = *((__global DATA_TYPE *)(input.ptr));
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+}
diff --git a/src/core/CL/cl_kernels/common/softmax_layer.cl b/src/core/CL/cl_kernels/common/softmax_layer.cl
new file mode 100644
index 0000000000..bfc0995bb8
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/softmax_layer.cl
@@ -0,0 +1,371 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+#define MIN_VALUE_float -FLT_MAX
+#define MIN_VALUE_half  -HALF_MAX
+#define MIN_VALUE_char  CHAR_MIN
+#define MIN_VALUE_uchar 0
+
+#define MIN_VALUE_TYPE_STR(data_type) MIN_VALUE_##data_type
+#define MIN_VALUE_TYPE(data_type) MIN_VALUE_TYPE_STR(data_type)
+#define MIN_VALUE MIN_VALUE_TYPE(DATA_TYPE)
+
+#ifdef SOFTMAX_X
+
+/** 3-pass softmax in the x dimension.
+ *
+ * List of preprocessors:
+ *   - DATA_TYPE: the input/output data type.
+ *   - TMP_DATA_TYPE: the data type used for computing and temporary tensor storage.
+ *     If DATA_TYPE is quantized, TMP_DATA_TYPE is floating-point, otherwise TMP_DATA_TYPE is the same as DATA_TYPE.
+ *   - IS_LOG (optional): indicating whether this is log softmax.
+ *   - LENGTH: the number of elements in softmax axis in the input/output tensors.
+ *   - BETA: the beta coefficient.
+ *   - IS_QUANTIZED (optional): indicating whether the input/output data type is quantized data.
+ *   - VEC_SIZE: the size of the vector.
+ *
+ * Additional preprocessors in case IS_QUANTIZED is present:
+ *   - SRC_SCALE and SRC_OFFSET: the quantization information of the source tensor.
+ *   - DST_SCALE and DST_OFFSET: the quantization information of the destination tensor.
+ *
+ * @param[in] src_ptr                  Pointer to the source tensor.
+ * @param[in] src_stride_0             Stride in bytes of the source tensor in the dimension corresponding to global ID 0.
+ * @param[in] src_stride_1             Stride in bytes of the source tensor in the dimension corresponding to global ID 1.
+ * @param[in] src_stride_2             Stride in bytes of the source tensor in the dimension corresponding to global ID 2.
+ * @param[in] src_offset_first_element Offset of the first element in the source tensor.
+ * @param[in] dst_ptr                  Pointer to the destination tensor.
+ * @param[in] dst_stride_0             Stride in bytes of the destination tensor in the dimension corresponding to global ID 0.
+ * @param[in] dst_stride_1             Stride in bytes of the destination tensor in the dimension corresponding to global ID 1.
+ * @param[in] dst_stride_2             Stride in bytes of the destination tensor in the dimension corresponding to global ID 2.
+ * @param[in] dst_offset_first_element Offset of the first element in the destination tensor.
+ * @param[in] tmp_ptr                  Pointer to the temporary tensor.
+ * @param[in] tmp_stride_0             Stride in bytes of the temporary tensor in the dimension corresponding to global ID 0.
+ * @param[in] tmp_stride_1             Stride in bytes of the temporary tensor in the dimension corresponding to global ID 1.
+ * @param[in] tmp_stride_2             Stride in bytes of the temporary tensor in the dimension corresponding to global ID 2.
+ * @param[in] tmp_offset_first_element Offset of the first element in the temporary tensor.
+ */
+__kernel void softmax_x(
+    __global uchar *src_ptr,
+    uint src_stride_0,
+    uint src_stride_1,
+    uint src_stride_2,
+    uint src_offset_first_element,
+
+    __global uchar *dst_ptr,
+    uint dst_stride_0,
+    uint dst_stride_1,
+    uint dst_stride_2,
+    uint dst_offset_first_element
+
+#ifdef IS_QUANTIZED
+    ,
+    __global uchar *tmp_ptr,
+    uint tmp_stride_0,
+    uint tmp_stride_1,
+    uint tmp_stride_2,
+    uint tmp_offset_first_element
+#endif // IS_QUANTIZED
+)
+{
+    const int dim_0 = get_global_id(0);
+    const int dim_1 = get_global_id(1);
+    const int dim_2 = get_global_id(2);
+
+    src_ptr += src_offset_first_element + dim_2 * src_stride_2 + dim_1 * src_stride_1 + dim_0 * src_stride_0;
+    dst_ptr += dst_offset_first_element + dim_2 * dst_stride_2 + dim_1 * dst_stride_1 + dim_0 * dst_stride_0;
+
+#ifdef IS_QUANTIZED
+    tmp_ptr += tmp_offset_first_element + dim_2 * tmp_stride_2 + dim_1 * tmp_stride_1 + dim_0 * tmp_stride_0;
+#else // IS_QUANTIZED
+    __global uchar *tmp_ptr = dst_ptr;
+#endif // IS_QUANTIZED
+
+    // Calculate max value.
+    DATA_TYPE max_value = MIN_VALUE;
+    int i = 0;
+
+    for (i = 0; i < LENGTH - VEC_SIZE; i += VEC_SIZE)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_ptr + i * sizeof(DATA_TYPE)));
+
+        max_value = max(max_value, MAX_REDUCE(data, VEC_SIZE));
+    }
+
+    for (; i < LENGTH; ++i)
+    {
+        DATA_TYPE data = *(__global DATA_TYPE *)(src_ptr + i * sizeof(DATA_TYPE));
+
+        max_value = max(max_value, data);
+    }
+
+    // Regularize the data.
+    TMP_DATA_TYPE sum_value = 0;
+
+#ifdef IS_QUANTIZED
+    TMP_DATA_TYPE max_value_f = (CONVERT(max_value, TMP_DATA_TYPE) - SRC_OFFSET) * SRC_SCALE;
+    TMP_DATA_TYPE regularize_offset = -SRC_OFFSET * SRC_SCALE * (TMP_DATA_TYPE)BETA - max_value_f * (TMP_DATA_TYPE)BETA;
+# define REGULARIZE(x) ((x) * SRC_SCALE * (TMP_DATA_TYPE)BETA + regularize_offset)
+#else // IS_QUANTIZED
+# define REGULARIZE(x) (((x) - max_value) * (TMP_DATA_TYPE)BETA)
+#endif // IS_QUANTIZED
+
+    for (i = 0; i < LENGTH - VEC_SIZE; i += VEC_SIZE)
+    {
+        VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) data = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_ptr + i * sizeof(DATA_TYPE))), VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE));
+
+        data = REGULARIZE(data);
+
+#ifdef IS_LOG
+        sum_value += SUM_REDUCE(exp(data), VEC_SIZE);
+#else // IS_LOG
+        data = exp(data);
+        sum_value += SUM_REDUCE(data, VEC_SIZE);
+#endif // IS_LOG
+
+        VSTORE(VEC_SIZE)(data, 0, (__global TMP_DATA_TYPE *)(tmp_ptr + i * sizeof(TMP_DATA_TYPE)));
+    }
+
+    for (; i < LENGTH; ++i)
+    {
+        TMP_DATA_TYPE data = CONVERT(*(__global DATA_TYPE *)(src_ptr + i * sizeof(DATA_TYPE)), TMP_DATA_TYPE);
+
+        data = REGULARIZE(data);
+
+#ifdef IS_LOG
+        sum_value += exp(data);
+#else // IS_LOG
+        data = exp(data);
+        sum_value += data;
+#endif // IS_LOG
+
+        *(__global TMP_DATA_TYPE *)(tmp_ptr + i * sizeof(TMP_DATA_TYPE)) = data;
+    }
+
+#undef REGULARIZE
+
+    // Normalize the data.
+#ifdef IS_QUANTIZED
+# if IS_LOG
+    TMP_DATA_TYPE norm_offset = -log(sum_value) + DST_OFFSET;
+#  define NORMALIZE(SIZE, x) CONVERT_SAT_ROUND((x) / DST_SCALE + norm_offset, VEC_DATA_TYPE(DATA_TYPE, SIZE), rte)
+# else // IS_LOG
+    TMP_DATA_TYPE norm_div = sum_value * DST_SCALE;
+#  define NORMALIZE(SIZE, x) CONVERT_SAT(add_sat(CONVERT_SAT_ROUND((x) / norm_div, VEC_DATA_TYPE(int, SIZE), rte), DST_OFFSET), VEC_DATA_TYPE(DATA_TYPE, SIZE))
+#  endif // IS_LOG
+#else // IS_QUANTIZED
+# if IS_LOG
+#  define NORMALIZE(SIZE, x) ((x) - log(sum_value))
+# else // IS_LOG
+#  define NORMALIZE(SIZE, x) ((x) / sum_value)
+# endif // IS_LOG
+#endif // IS_QUANTIZED
+
+    for (i = 0; i < LENGTH - VEC_SIZE; i += VEC_SIZE)
+    {
+        VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) data = VLOAD(VEC_SIZE)(0, (__global TMP_DATA_TYPE *)(tmp_ptr + i * sizeof(TMP_DATA_TYPE)));
+
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) result = NORMALIZE(VEC_SIZE, data);
+
+        VSTORE(VEC_SIZE)(result, 0, (__global DATA_TYPE *)(dst_ptr + i * sizeof(DATA_TYPE)));
+    }
+
+    for (; i < LENGTH; ++i)
+    {
+        TMP_DATA_TYPE data = *(__global TMP_DATA_TYPE *)(tmp_ptr + i * sizeof(TMP_DATA_TYPE));
+
+        DATA_TYPE result = NORMALIZE(1, data);
+
+        *(__global DATA_TYPE *)(dst_ptr + i * sizeof(DATA_TYPE)) = result;
+    }
+
+#undef NORMALIZE
+}
+
+#endif // SOFTMAX_X
+
+#ifdef SOFTMAX_NON_X
+
+/** 3-pass softmax in any dimension higher than the x dimension.
+ *
+ * List of preprocessors:
+ *   - DATA_TYPE: the input/output data type.
+ *   - TMP_DATA_TYPE: the data type used for computing and temporary tensor storage.
+ *     If DATA_TYPE is quantized, TMP_DATA_TYPE is floating-point, otherwise TMP_DATA_TYPE is the same as DATA_TYPE.
+ *   - IS_LOG (optional): indicating whether this is log softmax.
+ *   - LENGTH: the number of elements in softmax axis in the input/output tensors.
+ *   - BETA: the beta coefficient.
+ *   - IS_QUANTIZED (optional): indicating whether the input/output data type is quantized data.
+ *   - VEC_SIZE: the size of the vector.
+ *   - VEC_SIZE_LEFTOVER: the size of the leftover part.
+ *
+ * Additional preprocessors in case IS_QUANTIZED is present:
+ *   - SRC_SCALE and SRC_OFFSET: the quantization information of the source tensor.
+ *   - DST_SCALE and DST_OFFSET: the quantization information of the destination tensor.
+ *
+ * @param[in] src_ptr                  Pointer to the source tensor.
+ * @param[in] src_stride_0             Stride in bytes of the source tensor in the dimension corresponding to global ID 0.
+ * @param[in] src_stride_1             Stride in bytes of the source tensor in the dimension corresponding to global ID 1.
+ * @param[in] src_stride_2             Stride in bytes of the source tensor in the dimension corresponding to global ID 2.
+ * @param[in] src_offset_first_element Offset of the first element in the source tensor.
+ * @param[in] dst_ptr                  Pointer to the destination tensor.
+ * @param[in] dst_stride_0             Stride in bytes of the destination tensor in the dimension corresponding to global ID 0.
+ * @param[in] dst_stride_1             Stride in bytes of the destination tensor in the dimension corresponding to global ID 1.
+ * @param[in] dst_stride_2             Stride in bytes of the destination tensor in the dimension corresponding to global ID 2.
+ * @param[in] dst_offset_first_element Offset of the first element in the destination tensor.
+ * @param[in] tmp_ptr                  Pointer to the temporary tensor.
+ * @param[in] tmp_stride_0             Stride in bytes of the temporary tensor in the dimension corresponding to global ID 0.
+ * @param[in] tmp_stride_1             Stride in bytes of the temporary tensor in the dimension corresponding to global ID 1.
+ * @param[in] tmp_stride_2             Stride in bytes of the temporary tensor in the dimension corresponding to global ID 2.
+ * @param[in] tmp_offset_first_element Offset of the first element in the temporary tensor.
+ */
+__kernel void softmax_non_x(
+    __global uchar *src_ptr,
+    uint src_stride_0,
+    uint src_stride_1,
+    uint src_stride_2,
+    uint src_offset_first_element,
+
+    __global uchar *dst_ptr,
+    uint dst_stride_0,
+    uint dst_stride_1,
+    uint dst_stride_2,
+    uint dst_offset_first_element,
+
+    __global uchar *tmp_ptr,
+    uint tmp_stride_0,
+    uint tmp_stride_1,
+    uint tmp_stride_2,
+    uint tmp_offset_first_element,
+
+    uint src_stride_axis,
+    uint dst_stride_axis
+)
+{
+    const int dim_0 = max((int)get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE, 0);
+    const int dim_1 = get_global_id(1);
+    const int dim_2 = get_global_id(2);
+
+    src_ptr += src_offset_first_element + dim_2 * src_stride_2 + dim_1 * src_stride_1 + dim_0 * src_stride_0;
+    dst_ptr += dst_offset_first_element + dim_2 * dst_stride_2 + dim_1 * dst_stride_1 + dim_0 * dst_stride_0;
+    tmp_ptr += tmp_offset_first_element + dim_2 * tmp_stride_2 + dim_1 * tmp_stride_1 + dim_0 * tmp_stride_0;
+
+    // In case of processing quantized data, i.e. DATA_TYPE is smaller than TMP_DATA_TYPE:
+    //
+    // In the first pass (finding max), the quantized data is copied from the input tensor to the temporary tensor.
+    // Dequantization is not needed to find the max value and since dequantization widens the data, we defer it
+    // to the second pass pass to reduce memory bandwidth of the first pass.
+    //
+    // In the second pass, it reads the quantized data from the temporary tensor and writes the dequantized data
+    // back to the temporary tensor.
+    //
+    // To avoid dequantized data overwritting the unprocessed quantized data in the temporary tensor,
+    // this extra offset is introduced to store the quantized data at the end of the temporary tensor.
+    //
+    // Note: Another approach is to perform the second pass in reverse order, but for unexplanable reason
+    // it doesn't work in some devices.
+    uint tmp_extra_offset = LENGTH * VEC_SIZE * (sizeof(TMP_DATA_TYPE) - sizeof(DATA_TYPE));
+
+    // Calculate max value and store the input data to the temporary tensor in suitable format.
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) max_value = MIN_VALUE;
+    int i = 0;
+
+    for (i = 0; i < LENGTH; ++i)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_ptr + i * src_stride_axis));
+
+        max_value = max(max_value, data);
+
+        VSTORE(VEC_SIZE)(data, 0, (__global DATA_TYPE *)(tmp_ptr + tmp_extra_offset + i * VEC_SIZE * sizeof(DATA_TYPE)));
+    }
+
+    // Regularize the data.
+    VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) sum_value = 0;
+
+#ifdef IS_QUANTIZED
+    VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) max_value_f = (CONVERT(max_value, VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE)) - SRC_OFFSET) * SRC_SCALE;
+    VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) regularize_offset = -SRC_OFFSET * SRC_SCALE * (TMP_DATA_TYPE)BETA - max_value_f * (TMP_DATA_TYPE)BETA;
+# define REGULARIZE(x) ((x) * SRC_SCALE * (TMP_DATA_TYPE)BETA + regularize_offset)
+#else // IS_QUANTIZED
+# define REGULARIZE(x) (((x) - max_value) * (TMP_DATA_TYPE)BETA)
+#endif // IS_QUANTIZED
+
+    for (i = 0; i < LENGTH; ++i)
+    {
+        VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) data = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(tmp_ptr + tmp_extra_offset + i * VEC_SIZE * sizeof(DATA_TYPE))), VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE));
+
+        data = REGULARIZE(data);
+
+#ifdef IS_LOG
+        sum_value += exp(data);
+#else // IS_LOG
+        data = exp(data);
+        sum_value += data;
+#endif // IS_LOG
+
+        VSTORE(VEC_SIZE)(data, 0, (__global TMP_DATA_TYPE *)(tmp_ptr + i * VEC_SIZE * sizeof(TMP_DATA_TYPE)));
+    }
+
+#undef REGULARIZE
+
+    // Normalize the data.
+#ifdef IS_QUANTIZED
+# if IS_LOG
+    VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) norm_offset = -log(sum_value) + DST_OFFSET;
+#  define NORMALIZE(x) CONVERT_SAT_ROUND((x) / DST_SCALE + norm_offset, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE), rte)
+# else // IS_LOG
+    VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) norm_div = sum_value * DST_SCALE;
+#  define NORMALIZE(x) CONVERT_SAT(add_sat(CONVERT_SAT_ROUND((x) / norm_div, VEC_DATA_TYPE(int, VEC_SIZE), rte), DST_OFFSET), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))
+#  endif // IS_LOG
+#else // IS_QUANTIZED
+# if IS_LOG
+#  define NORMALIZE(x) ((x) - log(sum_value))
+# else // IS_LOG
+#  define NORMALIZE(x) ((x) / sum_value)
+# endif // IS_LOG
+#endif // IS_QUANTIZED
+
+    for (i = 0; i < LENGTH; ++i)
+    {
+        VEC_DATA_TYPE(TMP_DATA_TYPE, VEC_SIZE) data = VLOAD(VEC_SIZE)(0, (__global TMP_DATA_TYPE *)(tmp_ptr + i * VEC_SIZE * sizeof(TMP_DATA_TYPE)));
+
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) result0 = NORMALIZE(data);
+
+        STORE_VECTOR_SELECT(result, DATA_TYPE, dst_ptr + i * dst_stride_axis, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+    }
+
+#undef NORMALIZE
+}
+
+#endif // SOFTMAX_NON_X
+
+#undef MIN_VALUE
+#undef MIN_VALUE_TYPE
+#undef MIN_VALUE_TYPE_STR
+
+#undef MIN_VALUE_float
+#undef MIN_VALUE_half
+#undef MIN_VALUE_char
+#undef MIN_VALUE_uchar
diff --git a/src/core/CL/cl_kernels/common/stack_layer.cl b/src/core/CL/cl_kernels/common/stack_layer.cl
new file mode 100644
index 0000000000..2468bf750d
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/stack_layer.cl
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(AXIS) && defined(SRC_DIM2) && defined(DST_DIM3)
+
+#if AXIS == 0
+#define X_DST (idx_input)
+#define Y_DST (x_src)
+#define Z_DST (y_src)
+#define W_DST (z_src)
+#define K_DST (w_src)
+#elif AXIS == 1 // AXIS == 1
+#define X_DST (x_src)
+#define Y_DST (idx_input)
+#define Z_DST (y_src)
+#define W_DST (z_src)
+#define K_DST (w_src)
+#elif AXIS == 2 // AXIS == 2
+#define X_DST (x_src)
+#define Y_DST (y_src)
+#define Z_DST (idx_input)
+#define W_DST (z_src)
+#define K_DST (w_src)
+#elif AXIS == 3 // AXIS == 3
+#define X_DST (x_src)
+#define Y_DST (y_src)
+#define Z_DST (z_src)
+#define W_DST (idx_input)
+#define K_DST (w_src)
+#elif AXIS == 4 // AXIS == 4
+#define X_DST (x_src)
+#define Y_DST (y_src)
+#define Z_DST (z_src)
+#define W_DST (w_src)
+#define K_DST (idx_input)
+#else // AXIS not supported
+#error "Not supported axis"
+#endif // AXIS == 0
+
+/** OpenCL kernel to stack a rank-R tensor into one with rank-(R+1) along the axis dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note The dimension to stack the tensors along has to be passed at compile time using -DAXIS. i.e. -DAXIS=1
+ * @note Dimension 2 of the input tensor must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM2=112)
+ * @note Dimension 3 of the output tensor must be passed at compile time using -DDST_DIM3 (e.g. -DDST_DIM3=112)
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: All
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  idx_input                         Index of the input tensor in the list of tensors to stack
+ */
+__kernel void stack_layer(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+    unsigned int idx_input)
+{
+    uint x_src = get_global_id(0);
+    uint y_src = get_global_id(1);
+    uint z_src = (get_global_id(2) % SRC_DIM2);
+    uint w_src = (get_global_id(2) / SRC_DIM2);
+
+    __global DATA_TYPE *src = (__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + x_src * sizeof(DATA_TYPE) + y_src * src_stride_y + z_src * src_stride_z + w_src * src_stride_w);
+
+    __global DATA_TYPE *dst = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + X_DST * sizeof(DATA_TYPE) + Y_DST * dst_stride_y + Z_DST * dst_stride_z + W_DST * dst_stride_w + K_DST *
+                                                     dst_stride_w * (uint)DST_DIM3);
+
+    *dst = *src;
+}
+
+#undef X_DST
+#undef Y_DST
+#undef Z_DST
+#undef W_DST
+#endif // defined(DATA_TYPE) && defined(AXIS) && defined(SRC_DIM2) && defined(DST_DIM3)
diff --git a/src/core/CL/cl_kernels/common/tile.cl b/src/core/CL/cl_kernels/common/tile.cl
new file mode 100644
index 0000000000..4d8f802ea1
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/tile.cl
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2018-2021, 2023-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#if defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_DEPTH)
+/** Perform a floor operation on an input tensor.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Can only take floating point data types.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: All
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void tile(
+    TENSOR4D_DECLARATION(input),
+    TENSOR4D_DECLARATION(output))
+{
+    Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output);
+    Tensor4D input  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input);
+
+    // For all coordinates but x, each tile copies from the input
+    const int y     = get_global_id(1);
+    const int z     = get_global_id(2) % DST_DEPTH;
+    const int batch = get_global_id(2) / DST_DEPTH;
+
+#if defined(VEC_SIZE) && defined(OFFSET)
+    // If we are loading/storing multiple elements at time, we need to
+    // not exceed the input boundaries. The last threads need to backtrack
+    // of OFFSET elements. Those elements cumulates for previous tiles
+
+    const int id          = (int)(get_global_id(0));
+    const int multiple_no = id / SRC_WIDTH_TILES;
+    const int tile_no     = id % SRC_WIDTH_TILES;
+    const int last_tile   = (int)(tile_no == SRC_WIDTH_TILES - 1);
+
+    const int x_input  = tile_no * VEC_SIZE - last_tile * OFFSET;
+    const int x_output = multiple_no * SRC_WIDTH + x_input;
+
+    // Update the input and output pointers.
+    input.ptr  = tensor4D_offset(&input, x_input, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES);
+    output.ptr = tensor4D_offset(&output, x_output, y, z, batch);
+
+    // Copy the data
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
+
+    VSTORE(VEC_SIZE)
+    (data, 0, (__global DATA_TYPE *)output.ptr);
+#else  // !defined(VEC_SIZE) || !defined(OFFSET)
+    const int x = get_global_id(0);
+
+    // Update the input and output pointers.
+    input.ptr  = tensor4D_offset(&input, x % SRC_WIDTH, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES);
+    output.ptr = tensor4D_offset(&output, x, y, z, batch);
+
+    *((__global DATA_TYPE *)(output.ptr)) = *((__global DATA_TYPE *)(input.ptr));
+#endif // defined(VEC_SIZE) && defined(OFFSET)
+}
+#endif // defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_DEPTH)
diff --git a/src/core/CL/cl_kernels/common/transpose.cl b/src/core/CL/cl_kernels/common/transpose.cl
new file mode 100644
index 0000000000..5b4c68ca10
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/transpose.cl
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#define PARTIAL_STORE_M0 VEC_SIZE_LEFTOVER_X
+#define PARTIAL_STORE_N0 VEC_SIZE_LEFTOVER_Y
+
+#include "helpers.h"
+#include "repeat.h"
+
+#if defined(DATA_TYPE_IN_BYTES) && defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X) && defined(VEC_SIZE_Y) && defined(VEC_SIZE_LEFTOVER_Y)
+
+#if VEC_SIZE_X == 1
+#if VEC_SIZE_Y == 1
+#define TRANSPOSED_U(val) \
+    {                     \
+        u0                \
+    }
+#elif VEC_SIZE_Y == 2
+#define TRANSPOSED_U(val) \
+    {                     \
+        u0, u1            \
+    }
+#elif VEC_SIZE_Y == 3
+#define TRANSPOSED_U(val) \
+    {                     \
+        u0, u1, u2        \
+    }
+#elif VEC_SIZE_Y == 4
+#define TRANSPOSED_U(val) \
+    {                     \
+        u0, u1, u2, u3    \
+    }
+#elif VEC_SIZE_Y == 8
+#define TRANSPOSED_U(val)              \
+    {                                  \
+        u0, u1, u2, u3, u4, u5, u6, u7 \
+    }
+#elif VEC_SIZE_Y == 16
+#define TRANSPOSED_U(val)                        \
+    {                                            \
+        u0, u1, u2, u3, u4, u5, u6, u7,          \
+        u8, u9, u10, u11, u12, u13, u14, u15 \
+    }
+#endif /* switch VEC_SIZE_Y */
+#else  // VEC_SIZE_X == 1
+#if VEC_SIZE_Y == 1
+#define TRANSPOSED_U(val) \
+    {                     \
+        u0.val            \
+    }
+#elif VEC_SIZE_Y == 2
+#define TRANSPOSED_U(val) \
+    {                     \
+        u0.val, u1.val    \
+    }
+#elif VEC_SIZE_Y == 3
+#define TRANSPOSED_U(val)      \
+    {                          \
+        u0.val, u1.val, u2.val \
+    }
+#elif VEC_SIZE_Y == 4
+#define TRANSPOSED_U(val)              \
+    {                                  \
+        u0.val, u1.val, u2.val, u3.val \
+    }
+#elif VEC_SIZE_Y == 8
+#define TRANSPOSED_U(val)                                              \
+    {                                                                  \
+        u0.val, u1.val, u2.val, u3.val, u4.val, u5.val, u6.val, u7.val \
+    }
+#elif VEC_SIZE_Y == 16
+#define TRANSPOSED_U(val)                                                        \
+    {                                                                            \
+        u0.val, u1.val, u2.val, u3.val, u4.val, u5.val, u6.val, u7.val,          \
+        u8.val, u9.val, u10.val, u11.val, u12.val, u13.val, u14.val, u15.val \
+    }
+#endif /* switch VEC_SIZE_Y */
+#endif // VEC_SIZE_X == 1
+
+#if DATA_TYPE_IN_BYTES == 4
+#define DATA_TYPE uint
+#elif DATA_TYPE_IN_BYTES == 2
+#define DATA_TYPE ushort
+#elif DATA_TYPE_IN_BYTES == 1
+#define DATA_TYPE uchar
+#else /* switch DATA_TYPE_IN_BYTES */
+#error DATA_TYPE_IN_BYTES not supported for transpose
+#endif /* switch DATA_TYPE_IN_BYTES */
+
+/** This OpenCL kernel computes the matrix transposition of input matrix
+ *
+ * @note The number of bytes of the data type need to be passed at compile time using -DDATA_TYPE_IN_BYTES. DATA_TYPE_IN_BYTES can be:
+ *  -# -DDATA_TYPE_IN_BYTES=1 for transposing U8 or S8 matrices
+ *  -# -DDATA_TYPE_IN_BYTES=2 for transposing U16, S16 or FP16 matrices
+ *  -# -DDATA_TYPE_IN_BYTES=4 for transposing U32, S32 or FP32 matrices
+ *  -# -DVEC_SIZE_X is the number of elements processed in X dimension
+ *  -# -DVEC_SIZE_LEFTOVER_X is the leftover size in the X dimension; x_dimension % VEC_SIZE_X
+ *  -# -DVEC_SIZE_Y is the number of elements processed in Y dimension
+ *  -# -DVEC_SIZE_LEFTOVER_Y is the leftover size in the Y dimension; y_dimension % VEC_SIZE_Y
+ *
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: All
+ * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data type: same as src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination matrix in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_gx_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void transpose(TENSOR3D_DECLARATION(src),
+                        TENSOR3D_DECLARATION(dst))
+{
+    uint x_offs = max((int)(get_global_id(0) * VEC_SIZE_X - (VEC_SIZE_X - VEC_SIZE_LEFTOVER_X) % VEC_SIZE_X), 0);
+    uint y_offs = max((int)(get_global_id(1) * VEC_SIZE_Y - (VEC_SIZE_Y - VEC_SIZE_LEFTOVER_Y) % VEC_SIZE_Y), 0);
+    uint z_offs = get_global_id(2);
+
+    // Compute addresses
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * DATA_TYPE_IN_BYTES + y_offs * src_stride_y + z_offs * src_stride_z;
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + y_offs * DATA_TYPE_IN_BYTES + x_offs * dst_stride_y + z_offs * dst_stride_z;
+
+    // Load the NxM block at (x, y)
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    u0 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)src_addr);
+#if VEC_SIZE_Y > 1
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    u1 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + src_stride_y));
+#endif /* VEC_SIZE_Y > 1 */
+#if VEC_SIZE_Y > 2
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    u2 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+#endif /* VEC_SIZE_Y > 2 */
+#if VEC_SIZE_Y > 3
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    u3 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+#endif /* VEC_SIZE_Y > 3 */
+#if VEC_SIZE_Y > 4
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    u4 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    u5 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    u6 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    u7 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y));
+#endif /* VEC_SIZE_Y > 4 */
+#if VEC_SIZE_Y > 8
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    u8 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 8 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    u9 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 9 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    u10 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 10 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    u11 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 11 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    u12 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 12 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    u13 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 13 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    u14 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 14 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    u15 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 15 * src_stride_y));
+#endif /* VEC_SIZE_Y > 8 */
+
+    //Create transposed vectors
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+    t0 = TRANSPOSED_U(s0);
+#if VEC_SIZE_X > 1
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+    t1 = TRANSPOSED_U(s1);
+#endif /* VEC_SIZE_X > 1 */
+#if VEC_SIZE_X > 2
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+    t2 = TRANSPOSED_U(s2);
+#endif /* VEC_SIZE_X > 2 */
+#if VEC_SIZE_X > 3
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+    t3 = TRANSPOSED_U(s3);
+#endif /* VEC_SIZE_X > 3 */
+#if VEC_SIZE_X > 4
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+    t4 = TRANSPOSED_U(s4);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+    t5 = TRANSPOSED_U(s5);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+    t6 = TRANSPOSED_U(s6);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+    t7 = TRANSPOSED_U(s7);
+#endif /* VEC_SIZE_X > 4 */
+#if VEC_SIZE_X > 8
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+    t8 = TRANSPOSED_U(s8);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+    t9 = TRANSPOSED_U(s9);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+    tA = TRANSPOSED_U(sA);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+    tB = TRANSPOSED_U(sB);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+    tC = TRANSPOSED_U(sC);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+    tD = TRANSPOSED_U(sD);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+    tE = TRANSPOSED_U(sE);
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
+    tF = TRANSPOSED_U(sF);
+#endif /* VEC_SIZE_X > 8 */
+
+    // Store the block at (y, x)
+    REPEAT_VAR_INIT_TO_CONST(VEC_SIZE_X, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+    STORE_BLOCK_BOUNDARY_AWARE(VEC_SIZE_X, VEC_SIZE_Y, DATA_TYPE, t, (__global uchar *)dst_addr, dst_stride_y, zout, VEC_SIZE_LEFTOVER_X, VEC_SIZE_LEFTOVER_Y, VEC_SIZE_LEFTOVER_X != 0
+                               && get_global_id(0) == 0,
+                               VEC_SIZE_LEFTOVER_Y != 0 && get_global_id(1) == 0);
+}
+
+#endif // defined(DATA_TYPE_IN_BYTES) && defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X) && defined(VEC_SIZE_Y) && defined(VEC_SIZE_LEFTOVER_Y)
diff --git a/src/core/CL/cl_kernels/common/unpooling_layer.cl b/src/core/CL/cl_kernels/common/unpooling_layer.cl
new file mode 100644
index 0000000000..6662dc9360
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/unpooling_layer.cl
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+/** Performs max unpooling function with pool size equal to 2.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32
+ * @note The width of the output tensor must be passed using -DWIDTH_DST e.g. -DWIDTH_DST=24
+ * @note The height of the output tensor must be passed using -DHEIGHT_DST e.g. -DHEIGHT_DST=54
+ * @note The depth of the output tensor must be passed using -DDEPTH_DST e.g. -DDEPTH_DST=32
+ *
+ * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out] output_ptr                            Pointer to the output tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                       Stride of the output tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the output tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the output tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the output tensor
+ * @param[in]  indices_ptr                           Pointer to the indices tensor. Supported data types: U32
+ * @param[in]  indices_stride_x                      Stride of the indices tensor in X dimension (in bytes)
+ * @param[in]  indices_step_x                        indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  indices_stride_y                      Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in]  indices_step_y                        indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  indices_stride_z                      Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in]  indices_step_z                        indices_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
+ */
+__kernel void max_unpooling_layer_2(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output),
+    TENSOR3D_DECLARATION(indices))
+{
+    Tensor3D input   = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output  = CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(output);
+    Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices);
+
+    unsigned int index = *((__global unsigned int *)indices.ptr);
+    DATA_TYPE value    = *((__global DATA_TYPE *)input.ptr);
+
+    *((__global DATA_TYPE *)tensor3D_index2ptr(&output, WIDTH_DST, HEIGHT_DST, DEPTH_DST, index)) = value;
+}
+\ No newline at end of file