Reorganize the kernels into nhwc, nchw and common folders

The Following kernels have been split into nchw/nhwc kernels files: - batchnormalization_layer - batch_to_space - channel_shuffle - depth_to_space - dequantization_layer - im2col - normalization_layer - normalize_planar_yuv_layer - normalize_planar_yuv_layer_quantized - pooling_layer - pooling_layer_quantized - remap - reorg_layer - scale - scale_quantized - space_to_batch - space_to_depth - upsample_layer - winograd_filter_transform - winograd_input_transform - winograd_output_transform The following kernels have been moved to nchw folder: - direct_convolution1x1 - direct_convolution3x3 - direct_convolution5x5 - direct_convolution_quantized - prior_box_layer The following kernels have been moved to nhwc folder: - direct_convolution - dwc_native_fp_nhwc - dwc_native_quantized_nhwc The following kernels have been removed: - sobel_filter While the rest kerenls have been moved to the common folder. Partially resolves COMPMID-4453 Signed-off-by: Adnan AlSinan <adnan.alsinan@arm.com> Change-Id: Ic327ac935687ec351c610c65a3c6357f364a5a58 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5919 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
author: Adnan AlSinan <adnan.alsinan@arm.com> 2021-07-05 13:12:52 +0100
committer: Georgios Pinitas <georgios.pinitas@arm.com> 2021-07-25 13:04:23 +0000
commit: 7075fe2c5ee6f7cfe7cfd9454d905235e70b9ac4 (patch)
tree: b65671bdf37eb1ef8cc30ef64ab572da795546fa
parent: 22f5ed51f1b01f7cf6993a556a0b763e437926fc (diff)
download: ComputeLibrary-7075fe2c5ee6f7cfe7cfd9454d905235e70b9ac4.tar.gz
118 files changed, 7755 insertions, 7245 deletions
diff --git a/SConscript b/SConscript
index 886ad083ad..682f55310e 100644
--- a/SConscript
+++ b/SConscript
@@ -228,10 +228,143 @@ cpp_compiler = os.environ.get('CXX', default_cpp_compiler)
 # Generate embed files
 generate_embed = [ version_file ]
 if env['opencl'] and env['embed_kernels']:
-    cl_files = Glob('src/core/CL/cl_kernels/*.cl')
-    cl_files += Glob('src/core/CL/cl_kernels/*.h')
-
-    embed_files = [ f.get_path()+"embed" for f in cl_files ]
+    
+    # Header files
+    cl_helper_files = [ 'src/core/CL/cl_kernels/activation_float_helpers.h',
+                        'src/core/CL/cl_kernels/activation_quant_helpers.h',
+                        'src/core/CL/cl_kernels/gemm_helpers.h',
+                        'src/core/CL/cl_kernels/helpers_asymm.h',
+                        'src/core/CL/cl_kernels/helpers.h',
+                        'src/core/CL/cl_kernels/load_store_utility.h',
+                        'src/core/CL/cl_kernels/repeat.h',
+                        'src/core/CL/cl_kernels/tile_helpers.h',
+                        'src/core/CL/cl_kernels/types.h',
+                        'src/core/CL/cl_kernels/warp_helpers_quantized.h',
+                        'src/core/CL/cl_kernels/warp_helpers.h'
+                    ]
+
+    # Common kernels
+    cl_files_common = ['src/core/CL/cl_kernels/common/activation_layer.cl',
+                       'src/core/CL/cl_kernels/common/activation_layer_quant.cl',
+                       'src/core/CL/cl_kernels/common/arg_min_max.cl',
+                       'src/core/CL/cl_kernels/common/batchnormalization_layer.cl',
+                       'src/core/CL/cl_kernels/common/bounding_box_transform.cl',
+                       'src/core/CL/cl_kernels/common/bounding_box_transform_quantized.cl',
+                       'src/core/CL/cl_kernels/common/bitwise_op.cl',
+                       'src/core/CL/cl_kernels/common/cast.cl',
+                       'src/core/CL/cl_kernels/common/comparisons.cl',
+                       'src/core/CL/cl_kernels/common/concatenate.cl',
+                       'src/core/CL/cl_kernels/common/col2im.cl',
+                       'src/core/CL/cl_kernels/common/convert_fc_weights.cl',
+                       'src/core/CL/cl_kernels/common/copy_tensor.cl',
+                       'src/core/CL/cl_kernels/common/crop_tensor.cl',
+                       'src/core/CL/cl_kernels/common/deconvolution_layer.cl',
+                       'src/core/CL/cl_kernels/common/dequantization_layer.cl',
+                       'src/core/CL/cl_kernels/common/elementwise_operation.cl',
+                       'src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl',
+                       'src/core/CL/cl_kernels/common/elementwise_unary.cl',
+                       'src/core/CL/cl_kernels/common/fft_digit_reverse.cl',
+                       'src/core/CL/cl_kernels/common/fft.cl',
+                       'src/core/CL/cl_kernels/common/fft_scale.cl',
+                       'src/core/CL/cl_kernels/common/fill_border.cl',
+                       'src/core/CL/cl_kernels/common/floor.cl',
+                       'src/core/CL/cl_kernels/common/gather.cl',
+                       'src/core/CL/cl_kernels/common/gemm.cl',
+                       'src/core/CL/cl_kernels/common/gemv.cl',
+                       'src/core/CL/cl_kernels/common/gemm_v1.cl',
+                       'src/core/CL/cl_kernels/common/gemmlowp.cl',
+                       'src/core/CL/cl_kernels/common/generate_proposals.cl',
+                       'src/core/CL/cl_kernels/common/generate_proposals_quantized.cl',
+                       'src/core/CL/cl_kernels/common/instance_normalization.cl',
+                       'src/core/CL/cl_kernels/common/l2_normalize.cl',
+                       'src/core/CL/cl_kernels/common/mean_stddev_normalization.cl',
+                       'src/core/CL/cl_kernels/common/unpooling_layer.cl',
+                       'src/core/CL/cl_kernels/common/memset.cl',
+                       'src/core/CL/cl_kernels/common/nonmax.cl',
+                       'src/core/CL/cl_kernels/common/minmax_layer.cl',
+                       'src/core/CL/cl_kernels/common/pad_layer.cl',
+                       'src/core/CL/cl_kernels/common/permute.cl',
+                       'src/core/CL/cl_kernels/common/pixelwise_mul_float.cl',
+                       'src/core/CL/cl_kernels/common/pixelwise_mul_int.cl',
+                       'src/core/CL/cl_kernels/common/qlstm_layer_normalization.cl',
+                       'src/core/CL/cl_kernels/common/quantization_layer.cl',
+                       'src/core/CL/cl_kernels/common/range.cl',
+                       'src/core/CL/cl_kernels/common/reduction_operation.cl',
+                       'src/core/CL/cl_kernels/common/pooling_layer.cl',
+                       'src/core/CL/cl_kernels/common/reshape_layer.cl',
+                       'src/core/CL/cl_kernels/common/convolution_layer.cl',
+                       'src/core/CL/cl_kernels/common/reverse.cl',
+                       'src/core/CL/cl_kernels/common/roi_align_layer.cl',
+                       'src/core/CL/cl_kernels/common/roi_align_layer_quantized.cl',
+                       'src/core/CL/cl_kernels/common/roi_pooling_layer.cl',
+                       'src/core/CL/cl_kernels/common/select.cl',
+                       'src/core/CL/cl_kernels/common/softmax_layer.cl',
+                       'src/core/CL/cl_kernels/common/softmax_layer_quantized.cl',
+                       'src/core/CL/cl_kernels/common/stack_layer.cl',
+                       'src/core/CL/cl_kernels/common/slice_ops.cl',
+                       'src/core/CL/cl_kernels/common/tile.cl',
+                       'src/core/CL/cl_kernels/common/transpose.cl'
+                    ]
+
+    # NCHW kernels
+    cl_files_nchw = ['src/core/CL/cl_kernels/nchw/batch_to_space.cl',
+                    'src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl',
+                    'src/core/CL/cl_kernels/nchw/channel_shuffle.cl',
+                    'src/core/CL/cl_kernels/nchw/depth_to_space.cl',
+                    'src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl',
+                    'src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl',
+                    'src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl',
+                    'src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl',
+                    'src/core/CL/cl_kernels/nchw/dequantization_layer.cl',
+                    'src/core/CL/cl_kernels/nchw/im2col.cl',
+                    'src/core/CL/cl_kernels/nchw/normalization_layer.cl',
+                    'src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl',
+                    'src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl',
+                    'src/core/CL/cl_kernels/nchw/pooling_layer.cl',
+                    'src/core/CL/cl_kernels/nchw/pooling_layer_quantized.cl',
+                    'src/core/CL/cl_kernels/nchw/prior_box_layer.cl',
+                    'src/core/CL/cl_kernels/nchw/remap.cl',
+                    'src/core/CL/cl_kernels/nchw/reorg_layer.cl',
+                    'src/core/CL/cl_kernels/nchw/scale.cl',
+                    'src/core/CL/cl_kernels/nchw/scale_quantized.cl',
+                    'src/core/CL/cl_kernels/nchw/space_to_batch.cl',
+                    'src/core/CL/cl_kernels/nchw/space_to_depth.cl',
+                    'src/core/CL/cl_kernels/nchw/upsample_layer.cl',
+                    'src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl',
+                    'src/core/CL/cl_kernels/nchw/winograd_input_transform.cl',
+                    'src/core/CL/cl_kernels/nchw/winograd_output_transform.cl'
+                ]
+
+    # NHWC kernels
+    cl_files_nhwc = ['src/core/CL/cl_kernels/nhwc/batch_to_space.cl',
+                    'src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl',
+                    'src/core/CL/cl_kernels/nhwc/channel_shuffle.cl',
+                    'src/core/CL/cl_kernels/nhwc/direct_convolution.cl',
+                    'src/core/CL/cl_kernels/nhwc/depth_to_space.cl',
+                    'src/core/CL/cl_kernels/nhwc/dequantization_layer.cl',
+                    'src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl',
+                    'src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl',
+                    'src/core/CL/cl_kernels/nhwc/im2col.cl',
+                    'src/core/CL/cl_kernels/nhwc/normalization_layer.cl',
+                    'src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl',
+                    'src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl',
+                    'src/core/CL/cl_kernels/nhwc/pooling_layer.cl',
+                    'src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl',
+                    'src/core/CL/cl_kernels/nhwc/remap.cl',
+                    'src/core/CL/cl_kernels/nhwc/reorg_layer.cl',
+                    'src/core/CL/cl_kernels/nhwc/scale.cl',
+                    'src/core/CL/cl_kernels/nhwc/scale_quantized.cl',
+                    'src/core/CL/cl_kernels/nhwc/space_to_batch.cl',
+                    'src/core/CL/cl_kernels/nhwc/space_to_depth.cl',
+                    'src/core/CL/cl_kernels/nhwc/upsample_layer.cl',
+                    'src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl',
+                    'src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl',
+                    'src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl'
+                ]
+
+    cl_files = cl_helper_files + cl_files_common + cl_files_nchw + cl_files_nhwc
+
+    embed_files = [ f+"embed" for f in cl_files ]
     arm_compute_env.Append(CPPPATH =[Dir("./src/core/CL/").path] )
 
     generate_embed.append(arm_compute_env.Command(embed_files, cl_files, action=resolve_includes))
diff --git a/docs/Doxyfile b/docs/Doxyfile
index 27e28618b9..f70ec931eb 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -2068,7 +2068,7 @@ SEARCH_INCLUDES        = YES
 # preprocessor.
 # This tag requires that the tag SEARCH_INCLUDES is set to YES.
 
-INCLUDE_PATH           = 
+INCLUDE_PATH           = ./src/core/CL/cl_kernels/
 
 # You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
 # patterns (like *.h and *.hpp) to filter out the header-files in the
diff --git a/src/core/CL/cl_kernels/batch_to_space.cl b/src/core/CL/cl_kernels/batch_to_space.cl
deleted file mode 100644
index 8a71985b02..0000000000
--- a/src/core/CL/cl_kernels/batch_to_space.cl
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(BATCH_SIZE)
-/** Batch to space transformation. (NCHW)
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
- *
- * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: All
- * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
- * @param[in]  batch_id                             The input tensor batch id
- * @param[in]  block_shape_ptr                      Pointer to the source tensor. Supported data types: S32
- * @param[in]  block_shape_stride_x                 Stride of the source tensor in X dimension (in bytes)
- * @param[in]  block_shape_step_x                   block_shape_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  block_shape_stride_y                 Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  block_shape_step_y                   block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void batch_to_space_nchw(
-    TENSOR3D_DECLARATION(input),
-    const int batch_id,
-    VECTOR_DECLARATION(block_shape),
-    TENSOR4D_DECLARATION(output))
-{
-    Tensor3D in    = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor4D out   = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-    Vector   block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
-
-    const int block_x = *((__global int *)vector_offset(&block, 0));
-    const int block_y = *((__global int *)vector_offset(&block, 1));
-
-    const int r = (BATCH_SIZE / (block_x * block_y));
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-    const int z = get_global_id(2);
-    const int w = batch_id % r;
-
-    const int out_x = x * block_x + (batch_id / r) % block_x;
-    const int out_y = y * block_y + (batch_id / r) / block_x;
-
-    *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, w)) = *((__global DATA_TYPE *)in.ptr);
-}
-/** Batch to space transformation. (NHWC)
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
- *
- * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: All
- * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
- * @param[in]  batch_id                             The input tensor batch id
- * @param[in]  block_shape_ptr                      Pointer to the source tensor. Supported data types: S32
- * @param[in]  block_shape_stride_x                 Stride of the source tensor in X dimension (in bytes)
- * @param[in]  block_shape_step_x                   block_shape_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  block_shape_stride_y                 Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  block_shape_step_y                   block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void batch_to_space_nhwc(
-    TENSOR3D_DECLARATION(input),
-    const int batch_id,
-    VECTOR_DECLARATION(block_shape),
-    TENSOR4D_DECLARATION(output))
-{
-    Tensor3D in    = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor4D out   = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-    Vector   block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
-
-    const int block_x = *((__global int *)vector_offset(&block, 0));
-    const int block_y = *((__global int *)vector_offset(&block, 1));
-
-    const int r = (BATCH_SIZE / (block_x * block_y));
-    const int x = get_global_id(1);
-    const int y = get_global_id(2);
-    const int z = get_global_id(0);
-    const int w = batch_id % r;
-
-    const int out_x = x * block_x + (batch_id / r) % block_x;
-    const int out_y = y * block_y + (batch_id / r) / block_x;
-
-    *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, w)) = *((__global DATA_TYPE *)in.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(BATCH_SIZE)
-
-#if defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
-/** Batch to space transformation. (NCHW)
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
- * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
- * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
- *
- * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: All
- * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
- * @param[in]  batch_id                             The input tensor batch id
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void batch_to_space_static_nchw(
-    TENSOR3D_DECLARATION(input),
-    const int batch_id,
-    TENSOR4D_DECLARATION(output))
-{
-    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
-    const int block_x = BLOCK_SHAPE_X;
-    const int block_y = BLOCK_SHAPE_Y;
-
-    const int r = (BATCH_SIZE / (block_x * block_y));
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-    const int z = get_global_id(2);
-    const int w = batch_id % r;
-
-    const int out_x = x * block_x + (batch_id / r) % block_x;
-    const int out_y = y * block_y + (batch_id / r) / block_x;
-
-    *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, w)) = *((__global DATA_TYPE *)in.ptr);
-}
-/** Batch to space transformation. (NHWC)
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
- * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
- * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
- *
- * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: All
- * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
- * @param[in]  batch_id                             The input tensor batch id
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void batch_to_space_static_nhwc(
-    TENSOR3D_DECLARATION(input),
-    const int batch_id,
-    TENSOR4D_DECLARATION(output))
-{
-    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
-    const int block_x = BLOCK_SHAPE_X;
-    const int block_y = BLOCK_SHAPE_Y;
-
-    const int r = (BATCH_SIZE / (block_x * block_y));
-    const int x = get_global_id(1);
-    const int y = get_global_id(2);
-    const int z = get_global_id(0);
-    const int w = batch_id % r;
-
-    const int out_x = x * block_x + (batch_id / r) % block_x;
-    const int out_y = y * block_y + (batch_id / r) / block_x;
-
-    *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, w)) = *((__global DATA_TYPE *)in.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
-\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
deleted file mode 100644
index 89cbe4440e..0000000000
--- a/src/core/CL/cl_kernels/batchnormalization_layer.cl
+++ /dev/null
@@ -1,418 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#define ADD_OP(a, b) ((a) + (b))
-#define SUB_OP(a, b) ((a) - (b))
-#define MUL_OP(a, b) ((a) * (b))
-#define INVSQRT_OP(a) rsqrt((a))
-#define SQCVT_SAT(a) (a)
-
-#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE)
-#include "activation_float_helpers.h"
-
-/** Apply batch normalization.
- *
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
- *
- * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  mean_ptr                             Pointer to the mean source tensor. Supported data types: same as @p input_ptr
- * @param[in]  mean_stride_x                        Stride of the mean source tensor in X dimension (in bytes)
- * @param[in]  mean_step_x                          mean_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  mean_offset_first_element_in_bytes   The offset of the first element in the mean source tensor
- * @param[in]  var_ptr                              Pointer to the var tensor. Supported data types: same as @p input_ptr
- * @param[in]  var_stride_x                         Stride of the var tensor in X dimension (in bytes)
- * @param[in]  var_step_x                           var_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  var_offset_first_element_in_bytes    The offset of the first element in the var source tensor
- * @param[in]  beta_ptr                             Pointer to the beta source tensor. Supported data types: same as @p input_ptr
- * @param[in]  beta_stride_x                        Stride of the beta source tensor in X dimension (in bytes)
- * @param[in]  beta_step_x                          beta_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  beta_offset_first_element_in_bytes   The offset of the first element in the beta source tensor
- * @param[in]  gamma_ptr                            Pointer to the gamma source tensor. Supported data types: same as @p input_ptr
- * @param[in]  gamma_stride_x                       Stride of the gamma source tensor in X dimension (in bytes)
- * @param[in]  gamma_step_x                         gamma_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  gamma_offset_first_element_in_bytes  The offset of the first element in the gamma source tensor
- * @param[in]  epsilon                              Epsilon parameter in the batch normalization equation
- */
-__kernel void batchnormalization_layer_nchw(TENSOR3D_DECLARATION(input),
-#ifndef IN_PLACE
-                                            TENSOR3D_DECLARATION(output),
-#endif /* not IN_PLACE */
-                                            VECTOR_DECLARATION(mean),
-                                            VECTOR_DECLARATION(var),
-#ifndef USE_DEFAULT_BETA
-                                            VECTOR_DECLARATION(beta),
-#endif /* USE_DEFAULT_BETA */
-#ifndef USE_DEFAULT_GAMMA
-                                            VECTOR_DECLARATION(gamma),
-#endif /* USE_DEFAULT_GAMMA */
-                                            float epsilon)
-{
-    Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
-#ifdef IN_PLACE
-    Tensor3D out = in;
-#else  /* IN_PLACE */
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
-#endif /* IN_PLACE */
-    Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
-    Vector var  = CONVERT_TO_VECTOR_STRUCT(var);
-#ifndef USE_DEFAULT_BETA
-    Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
-#endif /* USE_DEFAULT_BETA */
-#ifndef USE_DEFAULT_GAMMA
-    Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
-#endif /* USE_DEFAULT_GAMMA */
-
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    data = 0;
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    denominator = 0;
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    numerator = 0;
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    x_bar = 0;
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    res = 0;
-
-    const int current_slice = get_global_id(2);
-
-    data        = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
-    denominator = *((__global DATA_TYPE *)(var.ptr + current_slice * var.stride_x));
-    denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
-
-    // Calculate x bar and store results
-    numerator = *((__global DATA_TYPE *)(mean.ptr + current_slice * mean.stride_x));
-    numerator = SUB_OP(data, numerator);
-    x_bar     = MUL_OP(numerator, denominator);
-
-#ifndef USE_DEFAULT_GAMMA
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * gamma.stride_x));
-
-    res = MUL_OP(gamma_vec, x_bar);
-#else  /* USE_DEFAULT_GAMMA */
-    // gamma is equal to 1, no need to perform multiplications
-    res                         = x_bar;
-#endif /* USE_DEFAULT_GAMMA */
-
-#ifndef USE_DEFAULT_BETA
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    beta_vec = *((__global DATA_TYPE *)(beta.ptr + current_slice * beta.stride_x));
-    // beta is not zero, hence we need to perform the addition
-    res = ADD_OP(res, beta_vec);
-#endif /* USE_DEFAULT_BETA */
-
-    res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res, A_VAL, B_VAL);
-
-    VSTORE(VEC_SIZE)
-    (res, 0, (__global DATA_TYPE *)out.ptr);
-}
-
-/** Apply batch normalization on tensors with NHWC format.
- *
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
- *
- * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  mean_ptr                             Pointer to the mean source tensor. Supported data types: same as @p input_ptr
- * @param[in]  mean_stride_x                        Stride of the mean source tensor in X dimension (in bytes)
- * @param[in]  mean_step_x                          mean_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  mean_offset_first_element_in_bytes   The offset of the first element in the mean source tensor
- * @param[in]  var_ptr                              Pointer to the var tensor. Supported data types: same as @p input_ptr
- * @param[in]  var_stride_x                         Stride of the var tensor in X dimension (in bytes)
- * @param[in]  var_step_x                           var_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  var_offset_first_element_in_bytes    The offset of the first element in the var source tensor
- * @param[in]  beta_ptr                             Pointer to the beta source tensor. Supported data types: same as @p input_ptr
- * @param[in]  beta_stride_x                        Stride of the beta source tensor in X dimension (in bytes)
- * @param[in]  beta_step_x                          beta_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  beta_offset_first_element_in_bytes   The offset of the first element in the beta source tensor
- * @param[in]  gamma_ptr                            Pointer to the gamma source tensor. Supported data types: same as @p input_ptr
- * @param[in]  gamma_stride_x                       Stride of the gamma source tensor in X dimension (in bytes)
- * @param[in]  gamma_step_x                         gamma_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  gamma_offset_first_element_in_bytes  The offset of the first element in the gamma source tensor
- * @param[in]  epsilon                              Epsilon parameter in the batch normalization equation
- */
-__kernel void batchnormalization_layer_nhwc(TENSOR3D_DECLARATION(input),
-#ifndef IN_PLACE
-                                            TENSOR3D_DECLARATION(output),
-#endif /* not IN_PLACE */
-                                            VECTOR_DECLARATION(mean),
-                                            VECTOR_DECLARATION(var),
-#ifndef USE_DEFAULT_BETA
-                                            VECTOR_DECLARATION(beta),
-#endif /* USE_DEFAULT_BETA */
-#ifndef USE_DEFAULT_GAMMA
-                                            VECTOR_DECLARATION(gamma),
-#endif /* USE_DEFAULT_GAMMA */
-                                            float epsilon)
-{
-    uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
-
-    __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
-#ifdef IN_PLACE
-    __global uchar *output_addr = input_ptr;
-#else  /* IN_PLACE */
-    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
-#endif /* IN_PLACE */
-    __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs;
-    __global uchar *var_addr  = var_ptr + var_offset_first_element_in_bytes + x_offs;
-#ifndef USE_DEFAULT_BETA
-    __global uchar *beta_addr = beta_ptr + beta_offset_first_element_in_bytes + x_offs;
-#endif /* USE_DEFAULT_BETA */
-#ifndef USE_DEFAULT_GAMMA
-    __global uchar *gamma_addr = gamma_ptr + gamma_offset_first_element_in_bytes + x_offs;
-#endif /* USE_DEFAULT_GAMMA */
-
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    data = 0;
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    denominator = 0;
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    numerator = 0;
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    x_bar = 0;
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    res0 = 0;
-
-    data        = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
-    denominator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)var_addr);
-    denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
-
-    // Calculate x bar and store results
-    numerator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr);
-    numerator = SUB_OP(data, numerator);
-    x_bar     = MUL_OP(numerator, denominator);
-
-#ifndef USE_DEFAULT_GAMMA
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    gamma_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)gamma_addr);
-
-    res0 = MUL_OP(gamma_vec, x_bar);
-#else  /* USE_DEFAULT_GAMMA */
-    // gamma is equal to 1, no need to perform multiplications
-    res0 = x_bar;
-#endif /* USE_DEFAULT_GAMMA */
-
-#ifndef USE_DEFAULT_BETA
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    beta_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)beta_addr);
-    // beta is not zero, hence we need to perform the addition
-    res0 = ADD_OP(res0, beta_vec);
-#endif /* USE_DEFAULT_BETA */
-
-    res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res0, A_VAL, B_VAL);
-
-    STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
-#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE)*/
-
-#if defined(DATA_TYPE) && defined(EPSILON)
-/** OpenCL kernel to fuse the weights of convolution or depthwise convolution layer with batch normalization when the data layout is either NCHW or NHWC
- *
- * @note The input weights tensor is assumed 4D with the OFMs in the fourth dimension
- * @note Data type should be passed at compile time using the -DDATA_TYPE, e.g. -DDATA_TYPE=float
- * @note The third dimension of the input tensor should be passed at compile time when weights belong to a convolution layer using -DDIM2=size. e.g. -DDIM2=16.
- *       For depthwise convolution weight do not pass DIM2
- * @note Data layout NHWC should be passed at compile time with -DNHWC. For data layout NCHW it is not required to pass any parameter
- * @note Batch normalization epsilon parameter should be passed at compile time using -DEPSILON=value. e.g. -DEPSILON=0.001f
- *
- * @param[in]  w_ptr                                 Pointer to the weights tensor. Supported data types: F16/F32
- * @param[in]  w_stride_x                            Stride of the weights tensor in X dimension (in bytes)
- * @param[in]  w_step_x                              w_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  w_stride_y                            Stride of the weights tensor in Y dimension (in bytes)
- * @param[in]  w_step_y                              w_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  w_stride_z                            Stride of the weights tensor in Z dimension (in bytes)
- * @param[in]  w_step_z                              w_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  w_offset_first_element_in_bytes       The offset of the first element in the weights tensor
- * @param[in]  b_ptr                                 (Optional) Pointer to the bias tensor. Supported data types: same as @p w_ptr
- * @param[in]  b_stride_x                            (Optional) Stride of the bias tensor in X dimension (in bytes)
- * @param[in]  b_step_x                              (Optional) b_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  b_stride_y                            (Optional) Stride of the bias tensor in Y dimension (in bytes)
- * @param[in]  b_step_y                              (Optional) b_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  b_stride_z                            (Optional) Stride of the bias tensor in Z dimension (in bytes)
- * @param[in]  b_step_z                              (Optional) b_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  b_offset_first_element_in_bytes       (Optional) The offset of the first element in the bias tensor
- * @param[in]  mean_ptr                              Pointer to the mean source tensor. Supported data types: same as @p w_ptr
- * @param[in]  mean_stride_x                         Stride of the mean source tensor in X dimension (in bytes)
- * @param[in]  mean_step_x                           mean_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  mean_offset_first_element_in_bytes    The offset of the first element in the mean source tensor
- * @param[in]  var_ptr                               Pointer to the var tensor. Supported data types: same as @p w_ptr
- * @param[in]  var_stride_x                          Stride of the var tensor in X dimension (in bytes)
- * @param[in]  var_step_x                            var_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  var_offset_first_element_in_bytes     The offset of the first element in the var source tensor
- * @param[out] w_fused_ptr                           (Optional) Pointer to the destination weights tensors. Supported data types: same as @p w_ptr
- * @param[in]  w_fused_stride_x                      (Optional) Stride of the destination weights tensor in X dimension (in bytes)
- * @param[in]  w_fused_step_x                        (Optional) w_fused_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  w_fused_stride_y                      (Optional) Stride of the destination weights tensor in Y dimension (in bytes)
- * @param[in]  w_fused_step_y                        (Optional) w_fused_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  w_fused_stride_z                      (Optional) Stride of the destination weights tensor in Z dimension (in bytes)
- * @param[in]  w_fused_step_z                        (Optional) w_fused_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  w_fused_offset_first_element_in_bytes (Optional) The offset of the first element in the destination weights tensor
- * @param[in]  b_fused_ptr                           (Optional) Pointer to the destination bias tensor. Supported data types: same as @p w_ptr
- * @param[in]  b_fused_stride_x                      (Optional) Stride of the destination bias tensor in X dimension (in bytes)
- * @param[in]  b_fused_step_x                        (Optional) b_fused_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  b_fused_offset_first_element_in_bytes (Optional) The offset of the first element in the destination bias tensor
- * @param[in]  beta_ptr                              (Optional) Pointer to the beta source tensor. Supported data types: same as @p w_ptr
- * @param[in]  beta_stride_x                         (Optional) Stride of the beta source tensor in X dimension (in bytes)
- * @param[in]  beta_step_x                           (Optional) beta_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  beta_offset_first_element_in_bytes    (Optional) The offset of the first element in the beta source tensor
- * @param[in]  gamma_ptr                             (Optional) Pointer to the gamma source tensor. Supported data types: same as @p w_ptr
- * @param[in]  gamma_stride_x                        (Optional) Stride of the gamma source tensor in X dimension (in bytes)
- * @param[in]  gamma_step_x                          (Optional) gamma_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  gamma_offset_first_element_in_bytes   (Optional) The offset of the first element in the gamma source tensor
- */
-__kernel void fuse_batchnormalization_layer(TENSOR3D_DECLARATION(w),
-#if defined(BIAS)
-                                            VECTOR_DECLARATION(b),
-#endif // defined(BIAS)
-                                            VECTOR_DECLARATION(mean),
-                                            VECTOR_DECLARATION(var)
-#ifndef IN_PLACE_W
-                                            ,
-                                            TENSOR3D_DECLARATION(w_fused)
-#endif // ifndef IN_PLACE_W
-#ifndef IN_PLACE_B
-                                            ,
-                                            VECTOR_DECLARATION(b_fused)
-#endif // ifndef IN_PLACE_B
-#if defined(BETA)
-                                            ,
-                                            VECTOR_DECLARATION(beta)
-#endif // defined(BETA)
-#if defined(GAMMA)
-                                            ,
-                                            VECTOR_DECLARATION(gamma)
-#endif // defined(GAMMA)
-                                           )
-{
-    int x = get_global_id(0);
-    int y = get_global_id(1);
-    int z = get_global_id(2);
-
-#if defined(DIM2)
-    int c0 = z % DIM2;
-    int c1 = z / DIM2;
-#else // ! defined(DIM2)
-    int c0                                                                                    = 0;
-#if defined(NHWC)
-    int c1                                                                                    = x;
-#else  // defined(NHWC)
-    int c1 = z;
-#endif // defined(NHWC)
-#endif // defined(DIM2)
-
-    int w_offset = x * sizeof(DATA_TYPE) + y * w_stride_y + z * w_stride_z;
-    int v_offset = c1 * sizeof(DATA_TYPE);
-
-    DATA_TYPE w_old = 0.0f;
-    DATA_TYPE b_old = 0.0f;
-    DATA_TYPE w_new = 0.0f;
-    DATA_TYPE b_new = 0.0f;
-    DATA_TYPE gamma = 1.0f;
-    DATA_TYPE mean  = 0.0f;
-    DATA_TYPE var   = 1.0f;
-    DATA_TYPE beta  = 0.0f;
-
-    w_old = *((__global DATA_TYPE *)(w_ptr + w_offset + w_offset_first_element_in_bytes));
-    var   = *((__global DATA_TYPE *)(var_ptr + v_offset + var_offset_first_element_in_bytes));
-    mean  = *((__global DATA_TYPE *)(mean_ptr + v_offset + mean_offset_first_element_in_bytes));
-
-#if defined(GAMMA)
-    gamma = *((__global DATA_TYPE *)(gamma_ptr + v_offset + gamma_offset_first_element_in_bytes));
-#endif // defined(GAMMA)
-
-    // Compute new weight
-    w_new = (gamma * w_old) / (sqrt(var + EPSILON));
-
-#if defined(IN_PLACE_W)
-    *((__global DATA_TYPE *)(w_ptr + w_offset + w_offset_first_element_in_bytes)) = w_new;
-#else  // defined(IN_PLACE_W)
-    *((__global DATA_TYPE *)(w_fused_ptr + w_offset + w_fused_offset_first_element_in_bytes)) = w_new;
-#endif // defined(IN_PLACE_W)
-
-    // Compute bias
-#if !defined(DIM2) && defined(NHWC)
-    if(z == 0 && y == 0)
-#else  // !defined(DIM2) && defined(NHWC)
-    if(x == 0 && y == 0 && c0 == 0)
-#endif // !defined(DIM2) && defined(NHWC)
-    {
-#if defined(BIAS)
-        b_old = *((__global DATA_TYPE *)(b_ptr + v_offset + b_offset_first_element_in_bytes));
-#endif // defined(BIAS)
-#if defined(BETA)
-        beta = *((__global DATA_TYPE *)(beta_ptr + v_offset + beta_offset_first_element_in_bytes));
-#endif // defined(BETA)
-
-        b_new = ((gamma * (b_old - mean)) / (sqrt(var + EPSILON))) + beta;
-
-#if defined(BIAS)
-
-#if defined(IN_PLACE_B)
-        *((__global DATA_TYPE *)(b_ptr + v_offset + b_offset_first_element_in_bytes)) = b_new;
-#else  // defined(IN_PLACE_B)
-        *((__global DATA_TYPE *)(b_fused_ptr + v_offset + b_fused_offset_first_element_in_bytes)) = b_new;
-#endif // defined(IN_PLACE_B)
-
-#else // defined(BIAS)
-
-#ifndef IN_PLACE_B
-        *((__global DATA_TYPE *)(b_fused_ptr + v_offset + b_fused_offset_first_element_in_bytes)) = b_new;
-#endif // ifndef IN_PLACE_B
-
-#endif // defined(BIAS)
-    }
-}
-#endif // defined(DATA_TYPE) && defined(EPSILON)
-\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/activation_layer.cl b/src/core/CL/cl_kernels/common/activation_layer.cl
index bc2c99b6c8..a04556a1ed 100644
--- a/src/core/CL/cl_kernels/activation_layer.cl
+++ b/src/core/CL/cl_kernels/common/activation_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/activation_layer_quant.cl b/src/core/CL/cl_kernels/common/activation_layer_quant.cl
index 66261019ab..38ee00b17a 100644
--- a/src/core/CL/cl_kernels/activation_layer_quant.cl
+++ b/src/core/CL/cl_kernels/common/activation_layer_quant.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/arg_min_max.cl b/src/core/CL/cl_kernels/common/arg_min_max.cl
index 6e57ed0af1..6e57ed0af1 100644
--- a/src/core/CL/cl_kernels/arg_min_max.cl
+++ b/src/core/CL/cl_kernels/common/arg_min_max.cl
diff --git a/src/core/CL/cl_kernels/common/batchnormalization_layer.cl b/src/core/CL/cl_kernels/common/batchnormalization_layer.cl
new file mode 100644
index 0000000000..18f54907df
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/batchnormalization_layer.cl
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(EPSILON)
+/** OpenCL kernel to fuse the weights of convolution or depthwise convolution layer with batch normalization when the data layout is either NCHW or NHWC
+ *
+ * @note The input weights tensor is assumed 4D with the OFMs in the fourth dimension
+ * @note Data type should be passed at compile time using the -DDATA_TYPE, e.g. -DDATA_TYPE=float
+ * @note The third dimension of the input tensor should be passed at compile time when weights belong to a convolution layer using -DDIM2=size. e.g. -DDIM2=16.
+ *       For depthwise convolution weight do not pass DIM2
+ * @note Data layout NHWC should be passed at compile time with -DNHWC. For data layout NCHW it is not required to pass any parameter
+ * @note Batch normalization epsilon parameter should be passed at compile time using -DEPSILON=value. e.g. -DEPSILON=0.001f
+ *
+ * @param[in]  w_ptr                                 Pointer to the weights tensor. Supported data types: F16/F32
+ * @param[in]  w_stride_x                            Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  w_step_x                              w_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  w_stride_y                            Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  w_step_y                              w_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  w_stride_z                            Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  w_step_z                              w_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  w_offset_first_element_in_bytes       The offset of the first element in the weights tensor
+ * @param[in]  b_ptr                                 (Optional) Pointer to the bias tensor. Supported data types: same as @p w_ptr
+ * @param[in]  b_stride_x                            (Optional) Stride of the bias tensor in X dimension (in bytes)
+ * @param[in]  b_step_x                              (Optional) b_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  b_stride_y                            (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in]  b_step_y                              (Optional) b_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  b_stride_z                            (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in]  b_step_z                              (Optional) b_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  b_offset_first_element_in_bytes       (Optional) The offset of the first element in the bias tensor
+ * @param[in]  mean_ptr                              Pointer to the mean source tensor. Supported data types: same as @p w_ptr
+ * @param[in]  mean_stride_x                         Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in]  mean_step_x                           mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mean_offset_first_element_in_bytes    The offset of the first element in the mean source tensor
+ * @param[in]  var_ptr                               Pointer to the var tensor. Supported data types: same as @p w_ptr
+ * @param[in]  var_stride_x                          Stride of the var tensor in X dimension (in bytes)
+ * @param[in]  var_step_x                            var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  var_offset_first_element_in_bytes     The offset of the first element in the var source tensor
+ * @param[out] w_fused_ptr                           (Optional) Pointer to the destination weights tensors. Supported data types: same as @p w_ptr
+ * @param[in]  w_fused_stride_x                      (Optional) Stride of the destination weights tensor in X dimension (in bytes)
+ * @param[in]  w_fused_step_x                        (Optional) w_fused_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  w_fused_stride_y                      (Optional) Stride of the destination weights tensor in Y dimension (in bytes)
+ * @param[in]  w_fused_step_y                        (Optional) w_fused_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  w_fused_stride_z                      (Optional) Stride of the destination weights tensor in Z dimension (in bytes)
+ * @param[in]  w_fused_step_z                        (Optional) w_fused_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  w_fused_offset_first_element_in_bytes (Optional) The offset of the first element in the destination weights tensor
+ * @param[in]  b_fused_ptr                           (Optional) Pointer to the destination bias tensor. Supported data types: same as @p w_ptr
+ * @param[in]  b_fused_stride_x                      (Optional) Stride of the destination bias tensor in X dimension (in bytes)
+ * @param[in]  b_fused_step_x                        (Optional) b_fused_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  b_fused_offset_first_element_in_bytes (Optional) The offset of the first element in the destination bias tensor
+ * @param[in]  beta_ptr                              (Optional) Pointer to the beta source tensor. Supported data types: same as @p w_ptr
+ * @param[in]  beta_stride_x                         (Optional) Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in]  beta_step_x                           (Optional) beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  beta_offset_first_element_in_bytes    (Optional) The offset of the first element in the beta source tensor
+ * @param[in]  gamma_ptr                             (Optional) Pointer to the gamma source tensor. Supported data types: same as @p w_ptr
+ * @param[in]  gamma_stride_x                        (Optional) Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in]  gamma_step_x                          (Optional) gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  gamma_offset_first_element_in_bytes   (Optional) The offset of the first element in the gamma source tensor
+ */
+__kernel void fuse_batchnormalization_layer(TENSOR3D_DECLARATION(w),
+#if defined(BIAS)
+                                            VECTOR_DECLARATION(b),
+#endif // defined(BIAS)
+                                            VECTOR_DECLARATION(mean),
+                                            VECTOR_DECLARATION(var)
+#ifndef IN_PLACE_W
+                                            ,
+                                            TENSOR3D_DECLARATION(w_fused)
+#endif // ifndef IN_PLACE_W
+#ifndef IN_PLACE_B
+                                            ,
+                                            VECTOR_DECLARATION(b_fused)
+#endif // ifndef IN_PLACE_B
+#if defined(BETA)
+                                            ,
+                                            VECTOR_DECLARATION(beta)
+#endif // defined(BETA)
+#if defined(GAMMA)
+                                            ,
+                                            VECTOR_DECLARATION(gamma)
+#endif // defined(GAMMA)
+                                           )
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+    int z = get_global_id(2);
+
+#if defined(DIM2)
+    int c0 = z % DIM2;
+    int c1 = z / DIM2;
+#else // ! defined(DIM2)
+    int c0                                                                                    = 0;
+#if defined(NHWC)
+    int c1                                                                                    = x;
+#else  // defined(NHWC)
+    int c1 = z;
+#endif // defined(NHWC)
+#endif // defined(DIM2)
+
+    int w_offset = x * sizeof(DATA_TYPE) + y * w_stride_y + z * w_stride_z;
+    int v_offset = c1 * sizeof(DATA_TYPE);
+
+    DATA_TYPE w_old = 0.0f;
+    DATA_TYPE b_old = 0.0f;
+    DATA_TYPE w_new = 0.0f;
+    DATA_TYPE b_new = 0.0f;
+    DATA_TYPE gamma = 1.0f;
+    DATA_TYPE mean  = 0.0f;
+    DATA_TYPE var   = 1.0f;
+    DATA_TYPE beta  = 0.0f;
+
+    w_old = *((__global DATA_TYPE *)(w_ptr + w_offset + w_offset_first_element_in_bytes));
+    var   = *((__global DATA_TYPE *)(var_ptr + v_offset + var_offset_first_element_in_bytes));
+    mean  = *((__global DATA_TYPE *)(mean_ptr + v_offset + mean_offset_first_element_in_bytes));
+
+#if defined(GAMMA)
+    gamma = *((__global DATA_TYPE *)(gamma_ptr + v_offset + gamma_offset_first_element_in_bytes));
+#endif // defined(GAMMA)
+
+    // Compute new weight
+    w_new = (gamma * w_old) / (sqrt(var + EPSILON));
+
+#if defined(IN_PLACE_W)
+    *((__global DATA_TYPE *)(w_ptr + w_offset + w_offset_first_element_in_bytes)) = w_new;
+#else  // defined(IN_PLACE_W)
+    *((__global DATA_TYPE *)(w_fused_ptr + w_offset + w_fused_offset_first_element_in_bytes)) = w_new;
+#endif // defined(IN_PLACE_W)
+
+    // Compute bias
+#if !defined(DIM2) && defined(NHWC)
+    if(z == 0 && y == 0)
+#else  // !defined(DIM2) && defined(NHWC)
+    if(x == 0 && y == 0 && c0 == 0)
+#endif // !defined(DIM2) && defined(NHWC)
+    {
+#if defined(BIAS)
+        b_old = *((__global DATA_TYPE *)(b_ptr + v_offset + b_offset_first_element_in_bytes));
+#endif // defined(BIAS)
+#if defined(BETA)
+        beta = *((__global DATA_TYPE *)(beta_ptr + v_offset + beta_offset_first_element_in_bytes));
+#endif // defined(BETA)
+
+        b_new = ((gamma * (b_old - mean)) / (sqrt(var + EPSILON))) + beta;
+
+#if defined(BIAS)
+
+#if defined(IN_PLACE_B)
+        *((__global DATA_TYPE *)(b_ptr + v_offset + b_offset_first_element_in_bytes)) = b_new;
+#else  // defined(IN_PLACE_B)
+        *((__global DATA_TYPE *)(b_fused_ptr + v_offset + b_fused_offset_first_element_in_bytes)) = b_new;
+#endif // defined(IN_PLACE_B)
+
+#else // defined(BIAS)
+
+#ifndef IN_PLACE_B
+        *((__global DATA_TYPE *)(b_fused_ptr + v_offset + b_fused_offset_first_element_in_bytes)) = b_new;
+#endif // ifndef IN_PLACE_B
+
+#endif // defined(BIAS)
+    }
+}
+#endif // defined(DATA_TYPE) && defined(EPSILON)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/bitwise_op.cl b/src/core/CL/cl_kernels/common/bitwise_op.cl
index a600bced9e..e142c1d275 100644
--- a/src/core/CL/cl_kernels/bitwise_op.cl
+++ b/src/core/CL/cl_kernels/common/bitwise_op.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/bounding_box_transform.cl b/src/core/CL/cl_kernels/common/bounding_box_transform.cl
index f2e9cb0ed0..f2e9cb0ed0 100644
--- a/src/core/CL/cl_kernels/bounding_box_transform.cl
+++ b/src/core/CL/cl_kernels/common/bounding_box_transform.cl
diff --git a/src/core/CL/cl_kernels/bounding_box_transform_quantized.cl b/src/core/CL/cl_kernels/common/bounding_box_transform_quantized.cl
index c1d45a56b9..c1d45a56b9 100644
--- a/src/core/CL/cl_kernels/bounding_box_transform_quantized.cl
+++ b/src/core/CL/cl_kernels/common/bounding_box_transform_quantized.cl
diff --git a/src/core/CL/cl_kernels/cast.cl b/src/core/CL/cl_kernels/common/cast.cl
index 036a683ec7..036a683ec7 100644
--- a/src/core/CL/cl_kernels/cast.cl
+++ b/src/core/CL/cl_kernels/common/cast.cl
diff --git a/src/core/CL/cl_kernels/col2im.cl b/src/core/CL/cl_kernels/common/col2im.cl
index 59c2d8a3aa..89054dcb31 100644
--- a/src/core/CL/cl_kernels/col2im.cl
+++ b/src/core/CL/cl_kernels/common/col2im.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/comparisons.cl b/src/core/CL/cl_kernels/common/comparisons.cl
index 408846144d..f05cb87835 100644
--- a/src/core/CL/cl_kernels/comparisons.cl
+++ b/src/core/CL/cl_kernels/common/comparisons.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/common/concatenate.cl
index d2e65408dc..394b20c739 100644
--- a/src/core/CL/cl_kernels/concatenate.cl
+++ b/src/core/CL/cl_kernels/common/concatenate.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/convert_fc_weights.cl b/src/core/CL/cl_kernels/common/convert_fc_weights.cl
index a451c0213b..01ef04a7d6 100644
--- a/src/core/CL/cl_kernels/convert_fc_weights.cl
+++ b/src/core/CL/cl_kernels/common/convert_fc_weights.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/convolution_layer.cl b/src/core/CL/cl_kernels/common/convolution_layer.cl
index cfd1f12328..be76929ac8 100644
--- a/src/core/CL/cl_kernels/convolution_layer.cl
+++ b/src/core/CL/cl_kernels/common/convolution_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/copy_tensor.cl b/src/core/CL/cl_kernels/common/copy_tensor.cl
index 9c90969827..753b98d1b0 100644
--- a/src/core/CL/cl_kernels/copy_tensor.cl
+++ b/src/core/CL/cl_kernels/common/copy_tensor.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/crop_tensor.cl b/src/core/CL/cl_kernels/common/crop_tensor.cl
index d9090dc838..d9090dc838 100644
--- a/src/core/CL/cl_kernels/crop_tensor.cl
+++ b/src/core/CL/cl_kernels/common/crop_tensor.cl
diff --git a/src/core/CL/cl_kernels/deconvolution_layer.cl b/src/core/CL/cl_kernels/common/deconvolution_layer.cl
index b1d5e61476..4ac5e3f0e9 100644
--- a/src/core/CL/cl_kernels/deconvolution_layer.cl
+++ b/src/core/CL/cl_kernels/common/deconvolution_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/common/dequantization_layer.cl b/src/core/CL/cl_kernels/common/dequantization_layer.cl
new file mode 100644
index 0000000000..7fa62577ce
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/dequantization_layer.cl
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET)
+
+/** This performs the dequantization of 8-bit unsigned integers to floating point.
+ *
+ * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
+ * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Quantization scale of input tensor is passed in with -DSCALE=scale.
+ * @note Quantization offset of input tensor is passed in with -DOFFSET=offset.
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: F16/F32
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void dequantization_layer(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+    output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+
+    // Load data
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
+
+    // Create scale and offset vectors
+    const VEC_DATA_TYPE(float, VEC_SIZE)
+    vscale = SCALE;
+
+    const VEC_DATA_TYPE(int, VEC_SIZE)
+    voffset = OFFSET;
+
+    // Dequantize
+    VEC_DATA_TYPE(float, VEC_SIZE)
+    res = vscale * CONVERT((val - voffset), VEC_DATA_TYPE(float, VEC_SIZE));
+
+    // Store result
+    VSTORE(VEC_SIZE)
+    (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
+#else  // !defined(LAST_ACCESSED_X)
+    *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr))) - (int)(OFFSET)) * (float)(SCALE));
+#endif // defined(LAST_ACCESSED_X)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/elementwise_operation.cl b/src/core/CL/cl_kernels/common/elementwise_operation.cl
index 45dcbfc6e2..45dcbfc6e2 100644
--- a/src/core/CL/cl_kernels/elementwise_operation.cl
+++ b/src/core/CL/cl_kernels/common/elementwise_operation.cl
diff --git a/src/core/CL/cl_kernels/elementwise_operation_quantized.cl b/src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl
index a11be80875..a11be80875 100644
--- a/src/core/CL/cl_kernels/elementwise_operation_quantized.cl
+++ b/src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl
diff --git a/src/core/CL/cl_kernels/elementwise_unary.cl b/src/core/CL/cl_kernels/common/elementwise_unary.cl
index d2d9d97d33..d2d9d97d33 100644
--- a/src/core/CL/cl_kernels/elementwise_unary.cl
+++ b/src/core/CL/cl_kernels/common/elementwise_unary.cl
diff --git a/src/core/CL/cl_kernels/fft.cl b/src/core/CL/cl_kernels/common/fft.cl
index 51763a620a..3f26d0f1a6 100644
--- a/src/core/CL/cl_kernels/fft.cl
+++ b/src/core/CL/cl_kernels/common/fft.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/fft_digit_reverse.cl b/src/core/CL/cl_kernels/common/fft_digit_reverse.cl
index de566212c6..5f64d95bf9 100644
--- a/src/core/CL/cl_kernels/fft_digit_reverse.cl
+++ b/src/core/CL/cl_kernels/common/fft_digit_reverse.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/fft_scale.cl b/src/core/CL/cl_kernels/common/fft_scale.cl
index 57e25ef504..c799dd3b9e 100644
--- a/src/core/CL/cl_kernels/fft_scale.cl
+++ b/src/core/CL/cl_kernels/common/fft_scale.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/fill_border.cl b/src/core/CL/cl_kernels/common/fill_border.cl
index 5775d899e8..a43343c9f4 100644
--- a/src/core/CL/cl_kernels/fill_border.cl
+++ b/src/core/CL/cl_kernels/common/fill_border.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/floor.cl b/src/core/CL/cl_kernels/common/floor.cl
index f6dd4edd2e..f6dd4edd2e 100644
--- a/src/core/CL/cl_kernels/floor.cl
+++ b/src/core/CL/cl_kernels/common/floor.cl
diff --git a/src/core/CL/cl_kernels/gather.cl b/src/core/CL/cl_kernels/common/gather.cl
index 41f439cb47..76eaefa92e 100644
--- a/src/core/CL/cl_kernels/gather.cl
+++ b/src/core/CL/cl_kernels/common/gather.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/common/gemm.cl
index 10435d376f..10435d376f 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/common/gemm.cl
diff --git a/src/core/CL/cl_kernels/gemm_v1.cl b/src/core/CL/cl_kernels/common/gemm_v1.cl
index a136a1b96b..a136a1b96b 100644
--- a/src/core/CL/cl_kernels/gemm_v1.cl
+++ b/src/core/CL/cl_kernels/common/gemm_v1.cl
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/common/gemmlowp.cl
index 5cafb5389c..5cafb5389c 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/common/gemmlowp.cl
diff --git a/src/core/CL/cl_kernels/gemv.cl b/src/core/CL/cl_kernels/common/gemv.cl
index aaa83975f8..71a372eb29 100644
--- a/src/core/CL/cl_kernels/gemv.cl
+++ b/src/core/CL/cl_kernels/common/gemv.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/generate_proposals.cl b/src/core/CL/cl_kernels/common/generate_proposals.cl
index e8306c55a8..5b8502072a 100644
--- a/src/core/CL/cl_kernels/generate_proposals.cl
+++ b/src/core/CL/cl_kernels/common/generate_proposals.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/generate_proposals_quantized.cl b/src/core/CL/cl_kernels/common/generate_proposals_quantized.cl
index 04264197f4..70f861c4b7 100644
--- a/src/core/CL/cl_kernels/generate_proposals_quantized.cl
+++ b/src/core/CL/cl_kernels/common/generate_proposals_quantized.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/instance_normalization.cl b/src/core/CL/cl_kernels/common/instance_normalization.cl
index adfbebd67d..adfbebd67d 100644
--- a/src/core/CL/cl_kernels/instance_normalization.cl
+++ b/src/core/CL/cl_kernels/common/instance_normalization.cl
diff --git a/src/core/CL/cl_kernels/l2_normalize.cl b/src/core/CL/cl_kernels/common/l2_normalize.cl
index fbe3406239..fbe3406239 100644
--- a/src/core/CL/cl_kernels/l2_normalize.cl
+++ b/src/core/CL/cl_kernels/common/l2_normalize.cl
diff --git a/src/core/CL/cl_kernels/mean_stddev_normalization.cl b/src/core/CL/cl_kernels/common/mean_stddev_normalization.cl
index 76be629934..05727a6aa6 100644
--- a/src/core/CL/cl_kernels/mean_stddev_normalization.cl
+++ b/src/core/CL/cl_kernels/common/mean_stddev_normalization.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, 2021 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/memset.cl b/src/core/CL/cl_kernels/common/memset.cl
index bb46a49f84..9ff25f3af4 100644
--- a/src/core/CL/cl_kernels/memset.cl
+++ b/src/core/CL/cl_kernels/common/memset.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/minmax_layer.cl b/src/core/CL/cl_kernels/common/minmax_layer.cl
index 655696f9a1..49356451df 100644
--- a/src/core/CL/cl_kernels/minmax_layer.cl
+++ b/src/core/CL/cl_kernels/common/minmax_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/nonmax.cl b/src/core/CL/cl_kernels/common/nonmax.cl
index ab13131807..702e635a89 100644
--- a/src/core/CL/cl_kernels/nonmax.cl
+++ b/src/core/CL/cl_kernels/common/nonmax.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/pad_layer.cl b/src/core/CL/cl_kernels/common/pad_layer.cl
index 903e924a2f..5ae4ec884d 100644
--- a/src/core/CL/cl_kernels/pad_layer.cl
+++ b/src/core/CL/cl_kernels/common/pad_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/permute.cl b/src/core/CL/cl_kernels/common/permute.cl
index db9e7ecc25..a03eeb1a19 100644
--- a/src/core/CL/cl_kernels/permute.cl
+++ b/src/core/CL/cl_kernels/common/permute.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_float.cl b/src/core/CL/cl_kernels/common/pixelwise_mul_float.cl
index 10875293a9..10875293a9 100644
--- a/src/core/CL/cl_kernels/pixelwise_mul_float.cl
+++ b/src/core/CL/cl_kernels/common/pixelwise_mul_float.cl
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_int.cl b/src/core/CL/cl_kernels/common/pixelwise_mul_int.cl
index 6d1c2d0c79..6d1c2d0c79 100644
--- a/src/core/CL/cl_kernels/pixelwise_mul_int.cl
+++ b/src/core/CL/cl_kernels/common/pixelwise_mul_int.cl
diff --git a/src/core/CL/cl_kernels/common/pooling_layer.cl b/src/core/CL/cl_kernels/common/pooling_layer.cl
new file mode 100644
index 0000000000..5122f2c251
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/pooling_layer.cl
@@ -0,0 +1,390 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "repeat.h"
+#include "tile_helpers.h"
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define POOL_OP(x, y) (fmax((x), (y)))
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) ((x) * (x))
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+#define SQRT_OP(x) sqrt((x))
+
+#if STRIDE_X == 1
+#define POOLING3x3(res, input, output) POOLING3x3_STRIDE1(res, input, output)
+#elif STRIDE_X == 2 /* STRIDE_X == 1 */
+#define POOLING3x3(res, input, output) POOLING3x3_STRIDE2(res, input, output)
+#elif STRIDE_X == 3 /* STRIDE_X not equals 1 or 2 */
+#define POOLING3x3(res, input, output) POOLING3x3_STRIDE3(res, input, output)
+#endif /* STRIDE_X == 3 */
+
+#if defined(FP_MIXED_PRECISION)
+#define CONVERT_TO_ACC_DATA_TYPE(x, n) CONVERT(x, VEC_DATA_TYPE(ACC_DATA_TYPE, n))
+#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) \
+    CONVERT_TO_ACC_DATA_TYPE(vload##n(offset, ptr), n)
+#else /* defined(FP_MIXED_PRECISION) */
+#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) vload##n(offset, ptr)
+#endif /* defined(FP_MIXED_PRECISION) */
+
+#define POOLING3x3_STRIDE1(res, input, output)                                                                                                       \
+    ({                                                                                                                                               \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
+        data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));                                   \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 2)                                                                                                              \
+        data01 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 4);                               \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
+        data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));                                   \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 2)                                                                                                              \
+        data11 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 4);                               \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
+        data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));                                   \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 2)                                                                                                              \
+        data21 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 4);                               \
+        data00 = POW2_OP(data00, 4);                                                                                                                 \
+        data01 = POW2_OP(data01, 2);                                                                                                                 \
+        data10 = POW2_OP(data10, 4);                                                                                                                 \
+        data11 = POW2_OP(data11, 2);                                                                                                                 \
+        data20 = POW2_OP(data20, 4);                                                                                                                 \
+        data21 = POW2_OP(data21, 2);                                                                                                                 \
+        \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
+        values00 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data00.s01212323);                                                                              \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
+        values01 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data01.s0, data00.s3, data01.s01);                                                              \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
+        values10 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data10.s01212323);                                                                              \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
+        values11 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data11.s0, data10.s3, data11.s01);                                                              \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
+        values20 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data20.s01212323);                                                                              \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
+        values21 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data21.s0, data20.s3, data21.s01);                                                              \
+        \
+        values00 = POOL_OP(values00, values10);                                                                                                      \
+        values01 = POOL_OP(values01, values11);                                                                                                      \
+        values00 = POOL_OP(values00, values20);                                                                                                      \
+        values01 = POOL_OP(values01, values21);                                                                                                      \
+        \
+        res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s147, values01.s2)); \
+        res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s25, values01.s03));                                                           \
+    })
+
+#define POOLING3x3_STRIDE2(res, input, output)                                                                                                       \
+    ({                                                                                                                                               \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
+        data00               = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));                     \
+        ACC_DATA_TYPE data01 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8));                                       \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
+        data10               = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));                     \
+        ACC_DATA_TYPE data11 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8));                                       \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
+        data20               = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));                     \
+        ACC_DATA_TYPE data21 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8));                                       \
+        data00               = POW2_OP(data00, 8);                                                                                                   \
+        data01               = POW2_OP(data01, 1);                                                                                                   \
+        data10               = POW2_OP(data10, 8);                                                                                                   \
+        data11               = POW2_OP(data11, 1);                                                                                                   \
+        data20               = POW2_OP(data20, 8);                                                                                                   \
+        data21               = POW2_OP(data21, 1);                                                                                                   \
+        \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
+        values00 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data00.s01223445);                                                                              \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
+        values01 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s667, data01);                                                                           \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
+        values10 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data10.s01223445);                                                                              \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
+        values11 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data10.s667, data11);                                                                           \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
+        values20 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data20.s01223445);                                                                              \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
+        values21 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data20.s667, data21);                                                                           \
+        \
+        values00 = POOL_OP(values00, values10);                                                                                                      \
+        values01 = POOL_OP(values01, values11);                                                                                                      \
+        values00 = POOL_OP(values00, values20);                                                                                                      \
+        values01 = POOL_OP(values01, values21);                                                                                                      \
+        \
+        res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s147, values01.s2)); \
+        res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s25, values01.s03));                                                           \
+    })
+
+#define POOLING3x3_STRIDE3(res, input, output)                                                                                               \
+    ({                                                                                                                                       \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                      \
+        data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));                           \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                      \
+        data01 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8);                       \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                      \
+        data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));                           \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                      \
+        data11 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8);                       \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                      \
+        data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));                           \
+        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                      \
+        data21 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8);                       \
+        data00 = POW2_OP(data00, 8);                                                                                                         \
+        data01 = POW2_OP(data01, 4);                                                                                                         \
+        data10 = POW2_OP(data10, 8);                                                                                                         \
+        data11 = POW2_OP(data11, 4);                                                                                                         \
+        data20 = POW2_OP(data20, 8);                                                                                                         \
+        data21 = POW2_OP(data21, 4);                                                                                                         \
+        \
+        data00 = POOL_OP(data00, data10);                                                                                                    \
+        data01 = POOL_OP(data01, data11);                                                                                                    \
+        data00 = POOL_OP(data00, data20);                                                                                                    \
+        data01 = POOL_OP(data01, data21);                                                                                                    \
+        \
+        res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s036, data01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s147, data01.s2)); \
+        res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s25, data01.s03));                                                       \
+    })
+
+ACC_DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
+                                  const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    int       start_x = get_global_id(0) * stride_x - pad_x;
+    int       start_y = get_global_id(1) * stride_y - pad_y;
+    const int end_x   = min(start_x + pool_size_x, upper_bound_w);
+    const int end_y   = min(start_y + pool_size_y, upper_bound_h);
+#if defined(EXCLUDE_PADDING)
+    start_x = max(0, start_x);
+    start_y = max(0, start_y);
+#endif /* defined(EXCLUDE_PADDING) */
+    return ((end_y - start_y) * (end_x - start_x));
+}
+
+/** Performs a pooling function of pool size equal to 2.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
+ *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
+ *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void pooling_layer_2(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    // Load data
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
+    data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
+    data1 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+
+#if defined(POOL_L2)
+    // Raise to power of 2 for L2 Pooling
+    data0 = POW2_OP(data0, 2);
+    data1 = POW2_OP(data1, 2);
+#endif /* defined(POOL_L2) */
+
+    // Perform calculations
+    data0             = POOL_OP(data0, data1);
+    ACC_DATA_TYPE res = POOL_OP(data0.s0, data0.s1);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    // Divide by pool region in case of average or l2 pooling
+    res = DIV_OP(res, calculate_avg_scale(2, 2, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+    // Store result
+    *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
+}
+
+/** Performs a pooling function of pool size equal to 3
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
+ *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
+ *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void pooling_layer_3(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    // Load data
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
+    data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
+    data1 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
+    data2 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
+
+#if defined(POOL_L2)
+    // Raise to power of 2 for L2 Pooling
+    data0 = POW2_OP(data0, 3);
+    data1 = POW2_OP(data1, 3);
+    data2 = POW2_OP(data2, 3);
+#endif /* defined(POOL_L2) */
+
+    // Perform calculations
+    data0             = POOL_OP(data0, data1);
+    data0             = POOL_OP(data0, data2);
+    ACC_DATA_TYPE res = POOL_OP(POOL_OP(data0.s0, data0.s1), data0.s2);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    // Divide by pool region in case of average pooling
+    res = DIV_OP(res, calculate_avg_scale(3, 3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+    // Store result
+    *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
+}
+
+#if defined(POOLING3x3)
+
+#define CONVERT_OP(data_type) convert_##data_type##4
+#define CONVERT_VECTOR4(data_type) CONVERT_OP(data_type)
+
+VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
+calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upper_bound_h,
+                     const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    int4       start_x = ((int4)get_global_id(0) * 4 + (int4)(0, 1, 2, 3)) * (int4)stride_x - (int4)pad_x;
+    int        start_y = get_global_id(1) * stride_y - pad_y;
+    const int4 end_x   = min(start_x + (int4)pool_size, (int4)upper_bound_w);
+    const int  end_y   = min(start_y + pool_size, upper_bound_h);
+#if defined(EXCLUDE_PADDING)
+    start_x = max((int4)0, start_x);
+    start_y = max(0, start_y);
+#endif /* defined(EXCLUDE_PADDING) */
+    return (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(1.f) / CONVERT_VECTOR4(ACC_DATA_TYPE)(((int4)(end_y - start_y)) * (end_x - start_x));
+}
+
+/** Performs an optimized pooling function of pool size equal to 3 when the stride_x is less equal than 3
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
+ *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
+ *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void pooling_layer_optimized_3(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
+    res;
+
+    // Perform pooling 3x3 for 4 output elements
+    POOLING3x3(res, input, output);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    // Divide by pool region in case of average pooling
+    res *= calculate_avg_scale4(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+    vstore4(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 4)), 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif // defined(POOLING3x3)
diff --git a/src/core/CL/cl_kernels/qlstm_layer_normalization.cl b/src/core/CL/cl_kernels/common/qlstm_layer_normalization.cl
index 24cb111772..4494dd8cec 100644
--- a/src/core/CL/cl_kernels/qlstm_layer_normalization.cl
+++ b/src/core/CL/cl_kernels/common/qlstm_layer_normalization.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/quantization_layer.cl b/src/core/CL/cl_kernels/common/quantization_layer.cl
index 3538dae5f0..69cc288c25 100644
--- a/src/core/CL/cl_kernels/quantization_layer.cl
+++ b/src/core/CL/cl_kernels/common/quantization_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -80,8 +80,8 @@ __kernel void quantization_layer(
 
     // Create scale and offset vectors
     const VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) vscale = SCALE;
-    const VEC_DATA_TYPE(int, VEC_SIZE) voffset  = OFFSET;
-#else // defined(IS_FLOAT)
+    const VEC_DATA_TYPE(int, VEC_SIZE) voffset         = OFFSET;
+#else  // defined(IS_FLOAT)
     // Load data
     VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
     val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
diff --git a/src/core/CL/cl_kernels/range.cl b/src/core/CL/cl_kernels/common/range.cl
index d25d10e207..d25d10e207 100644
--- a/src/core/CL/cl_kernels/range.cl
+++ b/src/core/CL/cl_kernels/common/range.cl
diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/common/reduction_operation.cl
index 9f2c6e23b5..9f2c6e23b5 100644
--- a/src/core/CL/cl_kernels/reduction_operation.cl
+++ b/src/core/CL/cl_kernels/common/reduction_operation.cl
diff --git a/src/core/CL/cl_kernels/reshape_layer.cl b/src/core/CL/cl_kernels/common/reshape_layer.cl
index 2d6a7edade..bfdefc863e 100644
--- a/src/core/CL/cl_kernels/reshape_layer.cl
+++ b/src/core/CL/cl_kernels/common/reshape_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/reverse.cl b/src/core/CL/cl_kernels/common/reverse.cl
index 10ffe84aeb..6b0afb9c2c 100644
--- a/src/core/CL/cl_kernels/reverse.cl
+++ b/src/core/CL/cl_kernels/common/reverse.cl
@@ -1,5 +1,5 @@
 /*
-* Copyright (c) 2018-2020 Arm Limited.
+* Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/roi_align_layer.cl b/src/core/CL/cl_kernels/common/roi_align_layer.cl
index e0b98e68c9..8cfe5ddcb6 100644
--- a/src/core/CL/cl_kernels/roi_align_layer.cl
+++ b/src/core/CL/cl_kernels/common/roi_align_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -173,7 +173,7 @@ __kernel void roi_align_layer(
     const float2 roi_bin_grid = SAMPLING_RATIO;
 #else  // !defined(SAMPLING_RATIO)
     // Note that we subtract EPS_GRID before ceiling. This is to avoid situations where 1.000001 gets ceiled to 2.
-    const float2   roi_bin_grid = ceil(bin_size - EPS_GRID);
+    const float2 roi_bin_grid           = ceil(bin_size - EPS_GRID);
 #endif // defined(SAMPLING_RATIO)
 
     // Move input and output pointer across the fourth dimension
@@ -184,7 +184,7 @@ __kernel void roi_align_layer(
 #if defined(NHWC)
         __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, pz, px, py);
 #else  // !defined(NHWC)
-        __global DATA_TYPE *_output_ptr  = (__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz);
+        __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz);
 #endif // defined(NHWC)
         *_output_ptr = (__global DATA_TYPE)roi_align_1x1(&input,
                                                          region_start.x,
diff --git a/src/core/CL/cl_kernels/roi_align_layer_quantized.cl b/src/core/CL/cl_kernels/common/roi_align_layer_quantized.cl
index d5c9a0d9bf..e75dee06f6 100644
--- a/src/core/CL/cl_kernels/roi_align_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/common/roi_align_layer_quantized.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/roi_pooling_layer.cl b/src/core/CL/cl_kernels/common/roi_pooling_layer.cl
index 6899b952e0..6899b952e0 100644
--- a/src/core/CL/cl_kernels/roi_pooling_layer.cl
+++ b/src/core/CL/cl_kernels/common/roi_pooling_layer.cl
diff --git a/src/core/CL/cl_kernels/select.cl b/src/core/CL/cl_kernels/common/select.cl
index 6fd4bd4ce3..6fd4bd4ce3 100644
--- a/src/core/CL/cl_kernels/select.cl
+++ b/src/core/CL/cl_kernels/common/select.cl
diff --git a/src/core/CL/cl_kernels/slice_ops.cl b/src/core/CL/cl_kernels/common/slice_ops.cl
index dc3ffd91c1..d12c60f5ea 100644
--- a/src/core/CL/cl_kernels/slice_ops.cl
+++ b/src/core/CL/cl_kernels/common/slice_ops.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/softmax_layer.cl b/src/core/CL/cl_kernels/common/softmax_layer.cl
index 4d2d89dd73..4d2d89dd73 100644
--- a/src/core/CL/cl_kernels/softmax_layer.cl
+++ b/src/core/CL/cl_kernels/common/softmax_layer.cl
diff --git a/src/core/CL/cl_kernels/softmax_layer_quantized.cl b/src/core/CL/cl_kernels/common/softmax_layer_quantized.cl
index 4d5006d804..4d5006d804 100644
--- a/src/core/CL/cl_kernels/softmax_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/common/softmax_layer_quantized.cl
diff --git a/src/core/CL/cl_kernels/stack_layer.cl b/src/core/CL/cl_kernels/common/stack_layer.cl
index 438e858df2..2468bf750d 100644
--- a/src/core/CL/cl_kernels/stack_layer.cl
+++ b/src/core/CL/cl_kernels/common/stack_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/tile.cl b/src/core/CL/cl_kernels/common/tile.cl
index 79da7fe6b9..4332411688 100644
--- a/src/core/CL/cl_kernels/tile.cl
+++ b/src/core/CL/cl_kernels/common/tile.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/transpose.cl b/src/core/CL/cl_kernels/common/transpose.cl
index 82db2908b5..82db2908b5 100644
--- a/src/core/CL/cl_kernels/transpose.cl
+++ b/src/core/CL/cl_kernels/common/transpose.cl
diff --git a/src/core/CL/cl_kernels/unpooling_layer.cl b/src/core/CL/cl_kernels/common/unpooling_layer.cl
index 457e9bf8f1..6662dc9360 100644
--- a/src/core/CL/cl_kernels/unpooling_layer.cl
+++ b/src/core/CL/cl_kernels/common/unpooling_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/dequantization_layer.cl b/src/core/CL/cl_kernels/dequantization_layer.cl
deleted file mode 100644
index 127f67d940..0000000000
--- a/src/core/CL/cl_kernels/dequantization_layer.cl
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET)
-
-/** This performs the dequantization of 8-bit unsigned integers to floating point.
- *
- * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
- * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Quantization scale of input tensor is passed in with -DSCALE=scale.
- * @note Quantization offset of input tensor is passed in with -DOFFSET=offset.
- *
- * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8
- * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: F16/F32
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void dequantization_layer(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output))
-{
-    // Get pixels pointer
-    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-#if defined(LAST_ACCESSED_X)
-    // Check if access on width gets out of bounds
-    // If it does shift access vector to access elements within bounds
-    const int xi = (int)(get_global_id(0) * VEC_SIZE);
-    input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
-    output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
-
-    // Load data
-    VEC_DATA_TYPE(int, VEC_SIZE)
-    val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
-
-    // Create scale and offset vectors
-    const VEC_DATA_TYPE(float, VEC_SIZE)
-    vscale = SCALE;
-
-    const VEC_DATA_TYPE(int, VEC_SIZE)
-    voffset = OFFSET;
-
-    // Dequantize
-    VEC_DATA_TYPE(float, VEC_SIZE)
-    res = vscale * CONVERT((val - voffset), VEC_DATA_TYPE(float, VEC_SIZE));
-
-    // Store result
-    VSTORE(VEC_SIZE)
-    (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
-#else  // !defined(LAST_ACCESSED_X)
-    *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr))) - (int)(OFFSET)) * (float)(SCALE));
-#endif // defined(LAST_ACCESSED_X)
-}
-#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET)
-
-#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
-/** This performs per channel dequantization of 8-bit signed integers to floating point. (NCHW)
- *
- * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
- * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- *
- * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL
- * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: F16/F32
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  scale                                Pointer to buffer with the per channel quantized scales
- */
-__kernel void dequantization_layer_per_channel_nchw(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output),
-    __global float *scale)
-{
-    // Get pixels pointer
-    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-#if defined(LAST_ACCESSED_X)
-    // Check if access on width gets out of bounds
-    // If it does shift access vector to access elements within bounds
-    const int xi = (int)(get_global_id(0) * VEC_SIZE);
-    input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
-    output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
-
-    // Load data
-    VEC_DATA_TYPE(int, VEC_SIZE)
-    val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
-
-    // Create scale vectors
-    const VEC_DATA_TYPE(float, VEC_SIZE)
-    vscale = scale[get_global_id(2)];
-
-    // Dequantize
-    VEC_DATA_TYPE(float, VEC_SIZE)
-    res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE));
-
-    // Store result
-    VSTORE(VEC_SIZE)
-    (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
-#else  // !defined(LAST_ACCESSED_X)
-    *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(2)]);
-#endif // defined(LAST_ACCESSED_X)
-}
-/** This performs per channel dequantization of 8-bit signed integers to floating point. (NHWC)
- *
- * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
- * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- *
- * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL
- * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: F16/F32
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  scale                                Pointer to buffer with the per channel quantized scales
- */
-__kernel void dequantization_layer_per_channel_nhwc(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output),
-    __global float *scale)
-{
-    // Get pixels pointer
-    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-#if defined(LAST_ACCESSED_X)
-    // Check if access on width gets out of bounds
-    // If it does shift access vector to access elements within bounds
-    const int xi = (int)(get_global_id(0) * VEC_SIZE);
-    input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
-    output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
-    scale -= max(xi - (int)LAST_ACCESSED_X, 0);
-
-    // Load data
-    VEC_DATA_TYPE(int, VEC_SIZE)
-    val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
-
-    // Create scale vectors
-    const VEC_DATA_TYPE(float, VEC_SIZE)
-    vscale = VLOAD(VEC_SIZE)(0, &scale[xi]);
-
-    // Dequantize
-    VEC_DATA_TYPE(float, VEC_SIZE)
-    res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE));
-
-    // Store result
-    VSTORE(VEC_SIZE)
-    (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
-#else  // !defined(LAST_ACCESSED_X)
-    *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(0)]);
-#endif // defined(LAST_ACCESSED_X)
-}
-#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
diff --git a/src/core/CL/cl_kernels/depth_to_space.cl b/src/core/CL/cl_kernels/nchw/batch_to_space.cl
index f301e64d66..89129cff3f 100644
--- a/src/core/CL/cl_kernels/depth_to_space.cl
+++ b/src/core/CL/cl_kernels/nchw/batch_to_space.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,14 +23,14 @@
  */
 #include "helpers.h"
 
-#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
-/** Depth to space transformation. (NCHW)
+#if defined(DATA_TYPE) && defined(BATCH_SIZE)
+/** Batch to space transformation. (NCHW)
  *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor depth size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
- * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
  *
- * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: All.
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: All
  * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
@@ -39,6 +39,12 @@
  * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
  * @param[in]  batch_id                             The input tensor batch id
+ * @param[in]  block_shape_ptr                      Pointer to the source tensor. Supported data types: S32
+ * @param[in]  block_shape_stride_x                 Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  block_shape_step_x                   block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  block_shape_stride_y                 Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  block_shape_step_y                   block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
  * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
  * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
@@ -48,31 +54,41 @@
  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
-__kernel void depth_to_space_nchw(
+__kernel void batch_to_space_nchw(
     TENSOR3D_DECLARATION(input),
     const int batch_id,
+    VECTOR_DECLARATION(block_shape),
     TENSOR4D_DECLARATION(output))
 {
-    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+    Tensor3D in    = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor4D out   = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+    Vector   block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
+
+    const int block_x = *((__global int *)vector_offset(&block, 0));
+    const int block_y = *((__global int *)vector_offset(&block, 1));
 
-    const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+    const int r = (BATCH_SIZE / (block_x * block_y));
     const int x = get_global_id(0);
     const int y = get_global_id(1);
-    const int z = get_global_id(2) % r;
+    const int z = get_global_id(2);
+    const int w = batch_id % r;
 
-    const int out_x = x * BLOCK_SHAPE + (get_global_id(2) / r) % BLOCK_SHAPE;
-    const int out_y = y * BLOCK_SHAPE + (get_global_id(2) / r) / BLOCK_SHAPE;
+    const int out_x = x * block_x + (batch_id / r) % block_x;
+    const int out_y = y * block_y + (batch_id / r) / block_x;
 
-    *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, batch_id)) = *((__global DATA_TYPE *)in.ptr);
+    *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, w)) = *((__global DATA_TYPE *)in.ptr);
 }
-/** Depth to space transformation. (NHWC)
+#endif // defined(DATA_TYPE) && defined(BATCH_SIZE)
+
+#if defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
+/** Batch to space transformation. (NCHW)
  *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor depth size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
- * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
  *
- * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: All.
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: All
  * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
@@ -90,7 +106,7 @@ __kernel void depth_to_space_nchw(
  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
-__kernel void depth_to_space_nhwc(
+__kernel void batch_to_space_static_nchw(
     TENSOR3D_DECLARATION(input),
     const int batch_id,
     TENSOR4D_DECLARATION(output))
@@ -98,14 +114,18 @@ __kernel void depth_to_space_nhwc(
     Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
     Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
 
-    const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
-    const int x = get_global_id(1);
-    const int y = get_global_id(2);
-    const int z = get_global_id(0) % r;
+    const int block_x = BLOCK_SHAPE_X;
+    const int block_y = BLOCK_SHAPE_Y;
+
+    const int r = (BATCH_SIZE / (block_x * block_y));
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+    const int w = batch_id % r;
 
-    const int out_x = x * BLOCK_SHAPE + (get_global_id(0) / r) % BLOCK_SHAPE;
-    const int out_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE;
+    const int out_x = x * block_x + (batch_id / r) % block_x;
+    const int out_y = y * block_y + (batch_id / r) / block_x;
 
-    *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, batch_id)) = *((__global DATA_TYPE *)in.ptr);
+    *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, w)) = *((__global DATA_TYPE *)in.ptr);
 }
-#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
-\ No newline at end of file
+#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl b/src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl
new file mode 100644
index 0000000000..2d466661b3
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define INVSQRT_OP(a) rsqrt((a))
+#define SQCVT_SAT(a) (a)
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE)
+#include "activation_float_helpers.h"
+
+/** Apply batch normalization.
+ *
+ * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ *
+ * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  mean_ptr                             Pointer to the mean source tensor. Supported data types: same as @p input_ptr
+ * @param[in]  mean_stride_x                        Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in]  mean_step_x                          mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mean_offset_first_element_in_bytes   The offset of the first element in the mean source tensor
+ * @param[in]  var_ptr                              Pointer to the var tensor. Supported data types: same as @p input_ptr
+ * @param[in]  var_stride_x                         Stride of the var tensor in X dimension (in bytes)
+ * @param[in]  var_step_x                           var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  var_offset_first_element_in_bytes    The offset of the first element in the var source tensor
+ * @param[in]  beta_ptr                             Pointer to the beta source tensor. Supported data types: same as @p input_ptr
+ * @param[in]  beta_stride_x                        Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in]  beta_step_x                          beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  beta_offset_first_element_in_bytes   The offset of the first element in the beta source tensor
+ * @param[in]  gamma_ptr                            Pointer to the gamma source tensor. Supported data types: same as @p input_ptr
+ * @param[in]  gamma_stride_x                       Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in]  gamma_step_x                         gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  gamma_offset_first_element_in_bytes  The offset of the first element in the gamma source tensor
+ * @param[in]  epsilon                              Epsilon parameter in the batch normalization equation
+ */
+__kernel void batchnormalization_layer_nchw(TENSOR3D_DECLARATION(input),
+#ifndef IN_PLACE
+                                            TENSOR3D_DECLARATION(output),
+#endif /* not IN_PLACE */
+                                            VECTOR_DECLARATION(mean),
+                                            VECTOR_DECLARATION(var),
+#ifndef USE_DEFAULT_BETA
+                                            VECTOR_DECLARATION(beta),
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+                                            VECTOR_DECLARATION(gamma),
+#endif /* USE_DEFAULT_GAMMA */
+                                            float epsilon)
+{
+    Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D out = in;
+#else  /* IN_PLACE */
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+    Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
+    Vector var  = CONVERT_TO_VECTOR_STRUCT(var);
+#ifndef USE_DEFAULT_BETA
+    Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+    Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
+#endif /* USE_DEFAULT_GAMMA */
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    denominator = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    numerator = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    x_bar = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    res = 0;
+
+    const int current_slice = get_global_id(2);
+
+    data        = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+    denominator = *((__global DATA_TYPE *)(var.ptr + current_slice * var.stride_x));
+    denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
+
+    // Calculate x bar and store results
+    numerator = *((__global DATA_TYPE *)(mean.ptr + current_slice * mean.stride_x));
+    numerator = SUB_OP(data, numerator);
+    x_bar     = MUL_OP(numerator, denominator);
+
+#ifndef USE_DEFAULT_GAMMA
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * gamma.stride_x));
+
+    res = MUL_OP(gamma_vec, x_bar);
+#else  /* USE_DEFAULT_GAMMA */
+    // gamma is equal to 1, no need to perform multiplications
+    res = x_bar;
+#endif /* USE_DEFAULT_GAMMA */
+
+#ifndef USE_DEFAULT_BETA
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    beta_vec = *((__global DATA_TYPE *)(beta.ptr + current_slice * beta.stride_x));
+    // beta is not zero, hence we need to perform the addition
+    res = ADD_OP(res, beta_vec);
+#endif /* USE_DEFAULT_BETA */
+
+    res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res, A_VAL, B_VAL);
+
+    VSTORE(VEC_SIZE)
+    (res, 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE)*/
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/channel_shuffle.cl b/src/core/CL/cl_kernels/nchw/channel_shuffle.cl
new file mode 100644
index 0000000000..57d82e1e6f
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/channel_shuffle.cl
@@ -0,0 +1,103 @@
+/*
+* Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
+
+// Check valid VEC_SIZES
+#if VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
+#error "Only vector sizes 1, 2, 3, 4, 8 and 16 are supported"
+#endif // VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
+
+#define DIV_MOD_UINT(x, y, div_res, mod_res)                \
+    ({                                                      \
+        div_res = (uint)((x) * (float)(1.0f / (float)(y))); \
+        uint r  = div_res * (y);                            \
+        mod_res = (x)-r;                                    \
+    })
+
+/** Performs channel shuffle when the data layout is NCHW. See https://arxiv.org/pdf/1707.01083.pdf for details.
+ *
+ * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
+ * @note The depth of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
+ * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
+ * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
+ *       K is equal to num_channels / num_groups.
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: All
+ * @param[in]  src_stride_x                      Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_w                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void channel_shuffle_nchw(TENSOR4D_DECLARATION(src),
+                                   TENSOR4D_DECLARATION(dst))
+{
+    uint curr_channel = 0; // channel id of input
+    uint batch_id     = 0; // batch id
+    uint group_id     = 0; // group id
+    uint channel_id   = 0; // channel id within the group
+
+    // Compute curr_channel and batch_id
+    DIV_MOD_UINT(get_global_id(2), SRC_DIM_Z, batch_id, curr_channel);
+
+    // Compute group_id and channel_id
+    DIV_MOD_UINT(curr_channel, K, group_id, channel_id);
+
+    const uint x = get_global_id(0) * VEC_SIZE;
+    const uint y = get_global_id(1) * 2;
+    const uint z = channel_id * NUM_GROUPS + group_id;
+
+    // Load the Nx2 block
+    const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + curr_channel * src_stride_z + batch_id * src_stride_w;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    u0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    u1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+
+    // Store blocks
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w;
+    VSTORE(VEC_SIZE)
+    (u0, 0, (__global DATA_TYPE *)(output_ptr + 0 * dst_stride_y));
+    VSTORE(VEC_SIZE)
+    (u1, 0, (__global DATA_TYPE *)(output_ptr + 1 * dst_stride_y));
+}
+
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/depth_to_space.cl b/src/core/CL/cl_kernels/nchw/depth_to_space.cl
new file mode 100644
index 0000000000..b9f223fe9d
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/depth_to_space.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+/** Depth to space transformation. (NCHW)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor depth size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
+ * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: All.
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[in]  batch_id                             The input tensor batch id
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void depth_to_space_nchw(
+    TENSOR3D_DECLARATION(input),
+    const int batch_id,
+    TENSOR4D_DECLARATION(output))
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+    const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2) % r;
+
+    const int out_x = x * BLOCK_SHAPE + (get_global_id(2) / r) % BLOCK_SHAPE;
+    const int out_y = y * BLOCK_SHAPE + (get_global_id(2) / r) / BLOCK_SHAPE;
+
+    *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, batch_id)) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/dequantization_layer.cl b/src/core/CL/cl_kernels/nchw/dequantization_layer.cl
new file mode 100644
index 0000000000..e0203f7408
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/dequantization_layer.cl
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
+/** This performs per channel dequantization of 8-bit signed integers to floating point. (NCHW)
+ *
+ * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
+ * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: F16/F32
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  scale                                Pointer to buffer with the per channel quantized scales
+ */
+__kernel void dequantization_layer_per_channel_nchw(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output),
+    __global float *scale)
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+    output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+
+    // Load data
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
+
+    // Create scale vectors
+    const VEC_DATA_TYPE(float, VEC_SIZE)
+    vscale = scale[get_global_id(2)];
+
+    // Dequantize
+    VEC_DATA_TYPE(float, VEC_SIZE)
+    res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE));
+
+    // Store result
+    VSTORE(VEC_SIZE)
+    (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
+#else  // !defined(LAST_ACCESSED_X)
+    *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(2)]);
+#endif // defined(LAST_ACCESSED_X)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/direct_convolution1x1.cl b/src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl
index 8ab2d1d4ea..8ab2d1d4ea 100644
--- a/src/core/CL/cl_kernels/direct_convolution1x1.cl
+++ b/src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl
diff --git a/src/core/CL/cl_kernels/direct_convolution3x3.cl b/src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl
index 811df053c4..811df053c4 100644
--- a/src/core/CL/cl_kernels/direct_convolution3x3.cl
+++ b/src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl
diff --git a/src/core/CL/cl_kernels/direct_convolution5x5.cl b/src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl
index 59d668f0bf..59d668f0bf 100644
--- a/src/core/CL/cl_kernels/direct_convolution5x5.cl
+++ b/src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl
diff --git a/src/core/CL/cl_kernels/direct_convolution_quantized.cl b/src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl
index b80d4f587e..b80d4f587e 100644
--- a/src/core/CL/cl_kernels/direct_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl
diff --git a/src/core/CL/cl_kernels/im2col.cl b/src/core/CL/cl_kernels/nchw/im2col.cl
index a1467a0b36..fddf918c63 100644
--- a/src/core/CL/cl_kernels/im2col.cl
+++ b/src/core/CL/cl_kernels/nchw/im2col.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,7 +22,6 @@
  * SOFTWARE.
  */
 #include "helpers.h"
-
 #if defined(DATA_TYPE) && defined(ELEMENT_SIZE)
 
 #if ELEMENT_SIZE == 1
@@ -861,500 +860,4 @@ __kernel void im2col_generic_padx0_pady0_nchw(
 #endif // HAS_BIAS
 }
 #endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
-
-#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE)
-
-#define VECTOR_N VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-#define COND_N VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE)
-
-/** Store a 1x9 row or a 3x3 block in a boundary-aware manner to avoid paddings in the channel dimension
- *  @name IM2COL1X9_NHWC_STORE
- *
- *  @note To use this macro for a 3x3 block, @p ROW has to be 0
- *
- * @param[in] VECTOR_SIZE          The non-boundary vector width of @p DATA. Supported: 1(scalar), 2, 3, 4, 8, 16
- * @param[in] BOUNDARY_VECTOR_SIZE The boundary vector width of @p DATA. Supported: 1-16, but has to be <= @p size
- * @param[in] DATA_TYPE            Data type of @p DATA
- * @param[in] SRC_DEPTH            Input channel size / depth
- * @param[in] DATA                 Value variable base name
- * @param[in] ROW                  The row number to store. Supported: 0-8
- * @param[in] OUTPUT_PTR           Output pointer
- * @{
- */
-#if defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
-#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR)         \
-    const bool at_channel_boundary = get_global_id(0) == 0;                                                          \
-    if(at_channel_boundary)                                                                                          \
-    {                                                                                                                \
-        IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
-    }                                                                                                                \
-    else                                                                                                             \
-    {                                                                                                                \
-        IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR)                    \
-    }
-#else // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
-#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
-    IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR)
-#endif // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
-
-#define IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
-    VSTORE(VECTOR_SIZE)                                                                           \
-    (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH);                 \
-    VSTORE(VECTOR_SIZE)                                                                           \
-    (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH);                 \
-    VSTORE(VECTOR_SIZE)                                                                           \
-    (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH);                 \
-    VSTORE(VECTOR_SIZE)                                                                           \
-    (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH);                 \
-    VSTORE(VECTOR_SIZE)                                                                           \
-    (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH);                 \
-    VSTORE(VECTOR_SIZE)                                                                           \
-    (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH);                 \
-    VSTORE(VECTOR_SIZE)                                                                           \
-    (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH);                 \
-    VSTORE(VECTOR_SIZE)                                                                           \
-    (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH);                 \
-    VSTORE(VECTOR_SIZE)                                                                           \
-    (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH);
-
-#define IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
-    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
-    (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH);                                    \
-    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
-    (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH);                                    \
-    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
-    (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH);                                    \
-    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
-    (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH);                                    \
-    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
-    (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH);                                    \
-    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
-    (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH);                                    \
-    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
-    (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH);                                    \
-    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
-    (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH);                                    \
-    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
-    (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH);
-/** @}*/
-
-/** This kernel performs im2col when the kernel size is 3x3 and the data layout is NHWC
- *
- * @note This kernel computes VECTOR_SIZE elements
- * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
- * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
- * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
- * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
- * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
- * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
- */
-__kernel void im2col3x3_nhwc(
-    TENSOR3D_DECLARATION(src),
-    IMAGE_DECLARATION(dst),
-    uint src_stride_w,
-    uint dst_stride_w)
-{
-    // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
-    const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
-    const int ch           = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
-    const int yo           = get_global_id(1);
-    const int batch        = get_global_id(2); // batch size
-
-    // Calculate input indices
-    const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
-    const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
-
-    // Get input and output address
-    __global uchar *input_ptr  = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
-    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
-
-    int  yi_coord = 0;
-    int3 offset   = 0;
-
-    // Clamp xi
-    int3 xi_offset = ((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT);
-#if PAD_LEFT != 0 || PAD_RIGHT != 0
-#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
-    xi_offset = CLAMP(xi_offset, (int3)0, (int3)(SRC_WIDTH - 1));
-#endif // PAD_LEFT != 0 || PAD_RIGHT != 0
-    // Multiply by src_stride_y as the width (X) dimension here is the second (y) dimension in src NHWC tensor
-    xi_offset *= (int3)src_stride_y;
-
-    // Out-of-bound condition for X
-    int3 x_cond = (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) < (int3)0) || (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) >= (int3)SRC_WIDTH);
-
-    // yi == 0
-    // Clamp yi
-    // yi_coord is casted to unsigned int in order to use just a min() operation
-    // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
-    // This is a trick so that the values loaded in the padding areas are always from the last row (SRC_HEIGHT - 1),
-    // because of the negative yi_coord wrap-around, but it gets overwritten by PAD_VALUE immediately as the wrap-around
-    // also causes y_cond (y padding condition) to be satisfied
-    yi_coord = yi - (int)PAD_TOP;
-
-    // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
-#if PAD_TOP != 0 || PAD_BOTTOM != 0
-    yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
-#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
-
-    // Compute offset
-    offset = xi_offset + (yi_coord * (int)src_stride_z);
-
-    // Load input values
-    VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
-    VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
-    VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
-
-#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-    // Replace invalid values with PAD_VALUE
-    int y_cond = (int)((uint)(yi - (int)PAD_TOP) >= (uint)(SRC_HEIGHT));
-    values0    = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
-    values1    = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
-    values2    = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
-#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-
-    // yi == 1
-    // Clamp yi_coord (it can be negative if PAD_TOP > 1)
-    yi_coord = yi - (int)PAD_TOP + 1 * DILATION_Y;
-
-    // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
-#if PAD_TOP != 0 || PAD_BOTTOM != 0
-    yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
-#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
-
-    // Compute offset
-    offset = xi_offset + (yi_coord * (int)src_stride_z);
-
-    // Load input values
-    VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
-    VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
-    VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
-
-#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-    // Replace invalid values with zeros
-    y_cond  = (int)((uint)(yi - (int)PAD_TOP + 1 * DILATION_Y) >= (uint)(SRC_HEIGHT));
-    values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
-    values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
-    values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
-#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-
-    // yi == 2
-    // Clamp yi_coord
-    yi_coord = yi - (int)PAD_TOP + 2 * DILATION_Y;
-
-    // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
-#if PAD_TOP != 0 || PAD_BOTTOM != 0
-    yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
-#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
-
-    // Compute offset
-    offset = xi_offset + (yi_coord * (int)src_stride_z);
-
-    // Load input values
-    VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
-    VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
-    VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
-
-#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-    // Replace invalid values with PAD_VALUE
-    y_cond  = (int)((uint)(yi - (int)PAD_TOP + 2 * DILATION_Y) >= (uint)(SRC_HEIGHT));
-    values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
-    values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
-    values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
-#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-
-    // Store in a boundary-aware way to avoid padding
-    IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, 0, output_ptr)
-
-#ifdef HAS_BIAS
-    // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
-    // added at the end of the channel, while the boundary vec is at the beginning of the channel.
-    // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
-    // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
-    // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
-    if((ch + VECTOR_SIZE) >= SRC_DEPTH)
-    {
-        *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 9) = 1.0f;
-    }
-#endif // HAS_BIAS
-}
-
-#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-#define IM2COL1x9(i)                                                                                         \
-    ({                                                                                                       \
-        yi_coord = yi - (int)PAD_TOP + i * DILATION_Y;                                                       \
-        yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));                                              \
-        \
-        offset0 = xi_offset0 + (yi_coord * (int)src_stride_z);                                               \
-        offset1 = xi_offset1 + (yi_coord * (int)src_stride_z);                                               \
-        \
-        VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0));            \
-        VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1));            \
-        VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2));            \
-        VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3));            \
-        VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4));            \
-        VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5));            \
-        VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6));            \
-        VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7));            \
-        VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1));               \
-        \
-        int y_cond = (int)((uint)(yi - (int)PAD_TOP + i * DILATION_Y) >= (uint)(SRC_HEIGHT));                \
-        values0    = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s0))); \
-        values1    = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s1))); \
-        values2    = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s2))); \
-        values3    = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s3))); \
-        values4    = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s4))); \
-        values5    = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s5))); \
-        values6    = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s6))); \
-        values7    = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s7))); \
-        values8    = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond1)));    \
-        \
-        IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \
-    })
-#else // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-#define IM2COL1x9(i)                                                                                         \
-    ({                                                                                                       \
-        yi_coord = yi - (int)PAD_TOP + i * DILATION_Y;                                                       \
-        yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));                                              \
-        \
-        offset0 = xi_offset0 + (yi_coord * (int)src_stride_z);                                               \
-        offset1 = xi_offset1 + (yi_coord * (int)src_stride_z);                                               \
-        \
-        VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0));            \
-        VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1));            \
-        VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2));            \
-        VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3));            \
-        VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4));            \
-        VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5));            \
-        VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6));            \
-        VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7));            \
-        VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1));               \
-        \
-        IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \
-    })
-#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-
-/** This kernel performs im2col when the kernel size is 9x9 and the data layout is NHWC
- *
- * @note This kernel computes VECTOR_SIZE elements
- * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
- * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
- * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
- * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
- * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
- * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
- */
-__kernel void im2col9x9_nhwc(
-    TENSOR3D_DECLARATION(src),
-    IMAGE_DECLARATION(dst),
-    uint src_stride_w,
-    uint dst_stride_w)
-{
-    // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
-    const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
-    const int ch           = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
-    const int yo           = get_global_id(1);
-    const int batch        = get_global_id(2); // batch size
-
-    // Calculate input indices
-    const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
-    const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
-
-    // Get input and output address
-    __global uchar *input_ptr  = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
-    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
-
-    int  yi_coord = 0;
-    int8 offset0  = 0;
-    int  offset1  = 0;
-
-    // Clamp xi
-    int8 xi_offset0 = ((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT);
-    int  xi_offset1 = ((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT);
-
-#if PAD_LEFT != 0 || PAD_RIGHT != 0
-#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
-    xi_offset0 = CLAMP(xi_offset0, (int8)0, (int8)(SRC_WIDTH - 1));
-    xi_offset1 = CLAMP(xi_offset1, (int)0, (int)(SRC_WIDTH - 1));
-#endif // PAD_LEFT != 0 || PAD_RIGHT != 0
-    xi_offset0 *= (int8)src_stride_y;
-    xi_offset1 *= (int)src_stride_y;
-
-    // Out-of-bound condition for X
-    int8 x_cond0 = (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) < (int8)0) || (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) >= (int8)SRC_WIDTH);
-    int  x_cond1 = (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) < (int)0) || (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);
-
-    IM2COL1x9(0);
-    IM2COL1x9(1);
-    IM2COL1x9(2);
-    IM2COL1x9(3);
-    IM2COL1x9(4);
-    IM2COL1x9(5);
-    IM2COL1x9(6);
-    IM2COL1x9(7);
-    IM2COL1x9(8);
-
-#ifdef HAS_BIAS
-    // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
-    // added at the end of the channel, while the boundary vec is at the beginning of the channel.
-    // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
-    // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
-    // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
-    if((ch + VECTOR_SIZE) >= SRC_DEPTH)
-    {
-        *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 81) = 1.0f;
-    }
-#endif // HAS_BIAS
-}
-
-/** This opencl kernel performs a generic im2col implementation when the data layout is NHWC
- *
- * @note This kernel computes VECTOR_SIZE elements
- * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
- * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
- * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
- * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64
- * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
- * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
- * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
- * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
- * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
- * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
- */
-__kernel void im2col_generic_nhwc(
-    TENSOR3D_DECLARATION(src),
-    IMAGE_DECLARATION(dst),
-    uint src_stride_w,
-    uint dst_stride_w)
-{
-    // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
-    const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
-    const int ch           = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
-    const int yo           = get_global_id(1);
-    const int batch        = get_global_id(2); // batch size
-
-    // Calculate input indices
-    const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
-    const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
-
-    // Get input and output address
-    __global uchar *input_ptr  = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
-    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
-
-    int i = 0;
-    for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
-    {
-        // Clamp yi_coord
-        int yi_coord = yi + yk * DILATION_Y - (int)PAD_TOP;
-        yi_coord     = CLAMP(yi_coord, (int)0, (int)(SRC_HEIGHT - 1));
-
-        // Out-of-bound condition for Y
-        int y_border_condition = ((yi + yk * DILATION_Y - (int)PAD_TOP) < (int)0) || ((yi + yk * DILATION_Y - (int)PAD_TOP) >= (int)SRC_HEIGHT);
-
-        for(int xk = 0; xk < KERNEL_WIDTH; ++xk)
-        {
-            // Clamp xi_coord
-            int xi_coord = (xi + xk * DILATION_X - (int)PAD_LEFT);
-            xi_coord     = CLAMP(xi_coord, (int)0, (int)(SRC_WIDTH - 1));
-
-            // Out-of-bound condition for X
-            int x_border_condition = ((xi + xk * DILATION_X - (int)PAD_LEFT) < (int)0) || ((xi + xk * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);
-
-            int offset = xi_coord * (int)src_stride_y + (yi_coord * (int)src_stride_z);
-
-            VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset));
-
-#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
-            // Replace with PAD_VALUE if the value is out-of-bound
-            values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)x_border_condition || (COND_N)(y_border_condition)));
-#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
-
-            // Store in a boundary-aware way to avoid padding
-#if BOUNDARY_VECTOR_SIZE != VECTOR_SIZE
-            const bool at_channel_boundary = get_global_id(0) == 0;
-            if(at_channel_boundary)
-            {
-                VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)
-                (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);
-            }
-            else // at_channel_boundary
-#endif           // BOUNDARY_VECTOR_SIZE != VECTOR_SIZE
-            {
-                VSTORE(VECTOR_SIZE)
-                (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);
-            }
-            i++;
-        }
-    }
-
-#ifdef HAS_BIAS
-    // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
-    // added at the end of the channel, while the boundary vec is at the beginning of the channel.
-    // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
-    // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
-    // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
-    if((ch + VECTOR_SIZE) >= SRC_DEPTH)
-    {
-        *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT) = 1.0f;
-    }
-#endif // HAS_BIAS
-}
-#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE)
-#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE)
+#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/normalization_layer.cl b/src/core/CL/cl_kernels/nchw/normalization_layer.cl
new file mode 100644
index 0000000000..0fef98e295
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/normalization_layer.cl
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#define MUL_OP(x, y) ((x) * (y))
+#define ADD_OP(x, y) ((x) + (y))
+#define DIV_OP(x, y) ((x) / (y))
+#define POW_OP(x, y) pow((x), (y))
+#define SQCVT_SAT(a) (a)
+
+#if defined(NUM_SLICES)
+/** Apply cross-map normalization.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
+ * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192
+ * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA
+ *
+ * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void normalization_layer_cross_map_nchw(TENSOR3D_DECLARATION(input),
+                                                 TENSOR3D_DECLARATION(output))
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    acc = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0;
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    coeff_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(COEFF);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    beta_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(BETA);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA);
+
+    const int current_slice = get_global_id(2);
+    const int left_slice    = max(-(int)RADIUS, -current_slice);
+    const int right_slice   = min((int)RADIUS, (int)NUM_SLICES - 1 - current_slice);
+
+    for(int i = left_slice; i <= right_slice; i++)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+        values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, 0, i));
+        acc    = ADD_OP(acc, MUL_OP(values, values));
+    }
+
+    acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    normalized = POW_OP(acc, beta_v);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    normalized_pixel = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), normalized);
+
+    VSTORE(VEC_SIZE)
+    (normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif /* defined(NUM_SLICES) */
+
+#if defined(WIDTH_SIZE)
+/** Apply in-map normalization when tensors are in the NCHW data layout format.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
+ * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA
+ * @note The leftover size in the X dimension shoud be given as preprocessor argument using -DVEC_SIZE_LEFTOVER is; x_dimension % VEC_SIZE. e.g. -DVEC_SIZE_LEFTOVER=1
+ *
+ * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the first destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void normalization_layer_in_map_nchw(TENSOR3D_DECLARATION(input),
+                                              TENSOR3D_DECLARATION(output))
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    acc = 0;
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    coeff_v = SQCVT_SAT(COEFF);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    beta_v = SQCVT_SAT(BETA);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    kappa_v = SQCVT_SAT(KAPPA);
+
+    const int current_col = get_global_id(0) << 2;
+    const int left_pos    = max(-(int)RADIUS, -3 - current_col);
+    const int right_pos   = min((int)RADIUS, (int)WIDTH_SIZE - 1 - current_col);
+
+#if defined(IN_MAP_2D)
+    const int current_row = get_global_id(1);
+    const int first_row   = max(-(int)RADIUS, -current_row);
+    const int last_row    = min((int)RADIUS, (int)get_global_size(1) - 1 - current_row);
+#endif /* defined(IN_MAP_2D) */
+
+#if defined(IN_MAP_2D)
+    for(int j = first_row; j <= last_row; ++j)
+    {
+#endif /* defined(IN_MAP_2D) */
+        for(int i = left_pos; i <= right_pos; ++i)
+        {
+#if defined(IN_MAP_2D)
+            VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+            values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, j, 0));
+#else  /* defined(IN_MAP_2D) */
+            VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+            values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, 0, 0));
+#endif /* defined(IN_MAP_2D) */
+            acc = ADD_OP(acc, MUL_OP(values, values));
+        }
+#if defined(IN_MAP_2D)
+    }
+#endif /* defined(IN_MAP_2D) */
+
+    acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    normalized = POW_OP(acc, beta_v);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    normalized_pixel = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), normalized);
+
+    VSTORE(VEC_SIZE)
+    (normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif // defined(WIDTH_SIZE)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl
new file mode 100644
index 0000000000..23a0de76f7
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE)
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+/** Apply normalize_planar_yuv layer on tensors with NCHW data layout.
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
+ * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8
+ *
+ * @param[in]  src_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in]  src_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  mean_ptr                           Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  mean_stride_x                      Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in]  mean_step_x                        mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in]  std_ptr                            Pointer to the std tensor. Supported data types: same as @p src_ptr
+ * @param[in]  std_stride_x                       Stride of the std tensor in X dimension (in bytes)
+ * @param[in]  std_step_x                         std_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  std_offset_first_element_in_bytes  The offset of the first element in the var source tensor
+ */
+__kernel void normalize_planar_yuv_layer_nchw(TENSOR3D_DECLARATION(src),
+                                              TENSOR3D_DECLARATION(dst),
+                                              VECTOR_DECLARATION(mean),
+                                              VECTOR_DECLARATION(std))
+{
+    Tensor3D src  = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst  = CONVERT_TO_TENSOR3D_STRUCT(dst);
+    Vector   mean = CONVERT_TO_VECTOR_STRUCT(mean);
+    Vector   std  = CONVERT_TO_VECTOR_STRUCT(std);
+
+    const uint current_slice = get_global_id(2) % NUM_CHANNELS;
+
+    const DATA_TYPE curr_mean = *((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE)));
+    const DATA_TYPE curr_std  = *((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE)));
+
+    TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
+    TYPE res  = (data - curr_mean) / curr_std;
+
+    VSTORE(VEC_SIZE)
+    (res, 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl
new file mode 100644
index 0000000000..0f02ef6184
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE)
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define OFFSET_FLT ((float)OFFSET)
+#define SCALE_FLT ((float)SCALE)
+
+#if defined(NUM_CHANNELS)
+
+/** Apply normalize_planar_yuv layer on tensors with NCHW data layout.
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
+ * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8
+ * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8
+ * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8
+ *
+ * @param[in]  src_ptr                            Pointer to the first source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  src_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  mean_ptr                           Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in]  mean_stride_x                      Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in]  mean_step_x                        mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in]  std_ptr                            Pointer to the std tensor. Supported data types: same as @p src_ptr
+ * @param[in]  std_stride_x                       Stride of the std tensor in X dimension (in bytes)
+ * @param[in]  std_step_x                         std_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  std_offset_first_element_in_bytes  The offset of the first element in the var source tensor
+ */
+__kernel void normalize_planar_yuv_layer_q8_nchw(TENSOR3D_DECLARATION(src),
+                                                 TENSOR3D_DECLARATION(dst),
+                                                 VECTOR_DECLARATION(mean),
+                                                 VECTOR_DECLARATION(std))
+{
+    Tensor3D src  = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst  = CONVERT_TO_TENSOR3D_STRUCT(dst);
+    Vector   mean = CONVERT_TO_VECTOR_STRUCT(mean);
+    Vector   std  = CONVERT_TO_VECTOR_STRUCT(std);
+
+    const uint current_slice = get_global_id(2) % NUM_CHANNELS;
+
+    VEC_DATA_TYPE(float, VEC_SIZE)
+    curr_mean_flt = (VEC_DATA_TYPE(float, VEC_SIZE))(*((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE))));
+    curr_mean_flt = round(curr_mean_flt - OFFSET_FLT) * SCALE_FLT;
+
+    VEC_DATA_TYPE(float, VEC_SIZE)
+    curr_std_flt = (VEC_DATA_TYPE(float, VEC_SIZE))(*((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE))));
+    curr_std_flt = round(curr_std_flt - OFFSET_FLT) * SCALE_FLT;
+
+    VEC_DATA_TYPE(float, VEC_SIZE)
+    data_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr), VEC_DATA_TYPE(float, VEC_SIZE));
+    data_flt = round(data_flt - OFFSET_FLT) * SCALE_FLT;
+
+    // Perform normalization
+    VEC_DATA_TYPE(float, VEC_SIZE)
+    res_flt = (data_flt - curr_mean_flt) / curr_std_flt;
+
+    const TYPE res_u8 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE);
+    VSTORE(VEC_SIZE)
+    (res_u8, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+#endif // defined(NUM_CHANNELS)
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/pooling_layer.cl b/src/core/CL/cl_kernels/nchw/pooling_layer.cl
new file mode 100644
index 0000000000..790ddb381a
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/pooling_layer.cl
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "repeat.h"
+#include "tile_helpers.h"
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define POOL_OP(x, y) (fmax((x), (y)))
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) ((x) * (x))
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+#define SQRT_OP(x) sqrt((x))
+
+#if defined(FP_MIXED_PRECISION)
+#define CONVERT_TO_ACC_DATA_TYPE(x, n) CONVERT(x, VEC_DATA_TYPE(ACC_DATA_TYPE, n))
+#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) \
+    CONVERT_TO_ACC_DATA_TYPE(vload##n(offset, ptr), n)
+#else /* defined(FP_MIXED_PRECISION) */
+#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) vload##n(offset, ptr)
+#endif /* defined(FP_MIXED_PRECISION) */
+
+ACC_DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
+                                  const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    int       start_x = get_global_id(0) * stride_x - pad_x;
+    int       start_y = get_global_id(1) * stride_y - pad_y;
+    const int end_x   = min(start_x + pool_size_x, upper_bound_w);
+    const int end_y   = min(start_y + pool_size_y, upper_bound_h);
+#if defined(EXCLUDE_PADDING)
+    start_x = max(0, start_x);
+    start_y = max(0, start_y);
+#endif /* defined(EXCLUDE_PADDING) */
+    return ((end_y - start_y) * (end_x - start_x));
+}
+
+#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
+
+/** Performs a pooling function of pool size equal to N  (NCHW)
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
+ *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void pooling_layer_MxN_nchw(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
+    vdata               = INITIAL_VALUE;
+    ACC_DATA_TYPE sdata = INITIAL_VALUE;
+
+    // Load data
+    for(int y = 0; y < POOL_SIZE_Y; y++)
+    {
+        int x = 0;
+        for(; x <= ((int)POOL_SIZE_X - 8); x += 8)
+        {
+            VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
+            data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data0 *= data0;
+#endif /* defined(POOL_L2) */
+            vdata = POOL_OP(vdata, data0);
+        }
+
+        // Leftover
+        for(; x < (int)POOL_SIZE_X; ++x)
+        {
+            ACC_DATA_TYPE data0 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)));
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data0 *= data0;
+#endif /* defined(POOL_L2) */
+            sdata = POOL_OP(sdata, data0);
+        }
+    }
+
+    // Reduce result
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
+    reduce4 = POOL_OP(vdata.s0123, vdata.s4567);
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
+    reduce2           = POOL_OP(reduce4.s01, reduce4.s23);
+    ACC_DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);
+    res               = POOL_OP(res, sdata);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    // Divide by pool region in case of average pooling
+    res = DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+    // Store result
+    *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
+}
+#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
+
+#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+
+inline void offset_no_padding_nchw(const Tensor3D *input, uint *offset_top, uint *offset_bottom)
+{
+    const int pad_horiz = PAD_TENSOR_LEFT + PAD_TENSOR_RIGHT;
+    const int pad_vert  = PAD_TENSOR_TOP + PAD_TENSOR_BOTTOM;
+
+    const int x = get_global_id(0) * STRIDE_X;
+    const int y = get_global_id(1) * STRIDE_Y;
+    const int z = get_global_id(2);
+
+    //x axis: width, y axis: height, z axis: component
+    const uint padded_offset = input->offset_first_element_in_bytes
+                               + x * input->stride_x
+                               + y * input->stride_y
+                               + z * input->stride_z;
+
+    const uint offset_base = padded_offset
+                             - y * pad_horiz * sizeof(DATA_TYPE)                                               /* Horizontal padding for each row */
+                             - PAD_TENSOR_TOP * input->stride_y                                                /* top padding */
+                             - z * MAX_HEIGHT * pad_horiz * sizeof(DATA_TYPE) - z * pad_vert * input->stride_y /* Z plane padding */
+                             - PAD_TENSOR_LEFT * sizeof(DATA_TYPE);
+
+#if defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT)
+    *offset_top = (uint)((offset_base / sizeof(DATA_TYPE)) % (TENSOR_CHANNEL * TENSOR_WIDTH * TENSOR_HEIGHT));
+#else  /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */
+    *offset_top = (uint)(offset_base / sizeof(DATA_TYPE));
+#endif /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */
+
+    *offset_bottom = *offset_top + input->stride_y / sizeof(DATA_TYPE) - pad_horiz;
+
+    return;
+}
+
+#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+
+/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM
+ *
+ * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: F32
+ * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  indices_ptr                           Pointer to the indices tensor. Supported data types: U32
+ * @param[in]  indices_stride_x                      Stride of the indices tensor in X dimension (in bytes)
+ * @param[in]  indices_step_x                        indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  indices_stride_y                      Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in]  indices_step_y                        indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  indices_stride_z                      Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in]  indices_step_z                        indices_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
+ */
+__kernel void pooling_layer_2_nchw_indices_fp32(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output),
+    TENSOR3D_DECLARATION(indices))
+{
+    // Get pixels pointer
+    Tensor3D input   = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output  = CONVERT_TO_TENSOR3D_STRUCT(output);
+    Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices);
+
+    // Load data
+    float2 data0 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 0, 0));
+    float2 data1 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
+
+    // Perform calculations
+    float data0_max = POOL_OP(data0.s0, data0.s1);
+    float data1_max = POOL_OP(data1.s0, data1.s1);
+    float res       = POOL_OP(data0_max, data1_max);
+    // Store result
+    *(__global float *)output.ptr = res;
+
+#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+
+    uint offset_top    = 0;
+    uint offset_bottom = 0;
+
+    offset_no_padding_nchw(&input, &offset_top, &offset_bottom);
+
+    uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1));
+    uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1));
+    uint index  = select(index1, index0, isgreaterequal(data0_max, data1_max));
+
+    *(__global uint *)indices.ptr = index;
+
+#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+}
+
+/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F16
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM
+ *
+ * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: F16
+ * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  indices_ptr                           Pointer to the indices tensor. Supported data types: U32
+ * @param[in]  indices_stride_x                      Stride of the indices tensor in X dimension (in bytes)
+ * @param[in]  indices_step_x                        indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  indices_stride_y                      Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in]  indices_step_y                        indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  indices_stride_z                      Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in]  indices_step_z                        indices_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
+ */
+__kernel void pooling_layer_2_nchw_indices_fp16(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output),
+    TENSOR3D_DECLARATION(indices))
+{
+    // Get pixels pointer
+    Tensor3D input   = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output  = CONVERT_TO_TENSOR3D_STRUCT(output);
+    Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices);
+
+    // Load data
+    half2 data0 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 0, 0));
+    half2 data1 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 1, 0));
+
+    // Perform calculations
+    half data0_max = POOL_OP(data0.s0, data0.s1);
+    half data1_max = POOL_OP(data1.s0, data1.s1);
+    half res       = POOL_OP(data0_max, data1_max);
+    // Store result
+    *(__global half *)output.ptr = res;
+
+#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+
+    uint offset_top    = 0;
+    uint offset_bottom = 0;
+
+    offset_no_padding_nchw(&input, &offset_top, &offset_bottom);
+
+    uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1));
+    uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1));
+    uint index  = select(index1, index0, isgreaterequal(data0_max, data1_max));
+
+    *(__global uint *)indices.ptr = index;
+
+#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+}
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/nchw/pooling_layer_quantized.cl
new file mode 100644
index 0000000000..1440ef3ed1
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/pooling_layer_quantized.cl
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(INITIAL_VALUE)
+#define VEC_TYPE(VEC_SIZE) VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+#if defined(POOL_AVG)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) */
+#define POOL_OP(x, y) (max((x), (y)))
+#endif /* defined(POOL_AVG) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+
+#if defined(POOL_L2)
+#error "L2 pooling is not supported"
+#endif /* defined(POOL_L2) */
+
+int calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
+                        const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    int       start_x = get_global_id(0) * stride_x - pad_x;
+    int       start_y = get_global_id(1) * stride_y - pad_y;
+    const int end_x   = min(start_x + pool_size_x, upper_bound_w);
+    const int end_y   = min(start_y + pool_size_y, upper_bound_h);
+#if defined(EXCLUDE_PADDING)
+    start_x = max(0, start_x);
+    start_y = max(0, start_y);
+#endif /* defined(EXCLUDE_PADDING) */
+    return ((end_y - start_y) * (end_x - start_x));
+}
+
+/** Performs a pooling function of pool size equal to N (NCHW)
+ *
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note In case of average pooling the following information must be passed at compile time:
+ *       -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
+ *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ * @note Input data type must be passed at compile time using -DDAT_TYPE=type, e.g. -DDATA_TYPE=uchar
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pooling_layer_MxN_quantized_nchw(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    int8 vdata = INITIAL_VALUE;
+    int  sdata = INITIAL_VALUE;
+
+    // Load data
+    for(int y = 0; y < POOL_SIZE_Y; y++)
+    {
+        int x = 0;
+        for(; x <= ((int)POOL_SIZE_X - 8); x += 8)
+        {
+            VEC_TYPE(8)
+            data       = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
+            int8 data0 = convert_int8(data);
+            vdata      = POOL_OP(vdata, data0);
+        }
+
+        // Leftover
+        for(; x < (int)POOL_SIZE_X; ++x)
+        {
+            DATA_TYPE data = *((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
+            int data0      = convert_int(data);
+            sdata          = POOL_OP(sdata, data0);
+        }
+    }
+
+    // Reduce result
+    int4 reduce4 = POOL_OP(vdata.s0123, vdata.s4567);
+    int2 reduce2 = POOL_OP(reduce4.s01, reduce4.s23);
+    int  res     = POOL_OP(reduce2.s0, reduce2.s1);
+    res          = POOL_OP(res, sdata);
+
+#if defined(POOL_AVG)
+    res = round(DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)));
+#endif /* defined(POOL_AVG) */
+
+    DATA_TYPE result_q8 = CONVERT(res, DATA_TYPE);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+
+    const float result_f32   = convert_float(result_q8);
+    const float input_offset = (float)OFFSET_IN1;
+    const float input_scale  = (float)SCALE_IN1;
+    const float scale_out    = (float)SCALE_OUT;
+    const float offset_out   = (float)OFFSET_OUT;
+    const float in_f32       = (result_f32 - input_offset) * input_scale;
+    const float out_f32      = in_f32 / scale_out + offset_out;
+    result_q8                = CONVERT_SAT(convert_int_rte(out_f32), DATA_TYPE);
+
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
+    *(__global DATA_TYPE *)output.ptr = result_q8;
+}
+#endif // defined(DATA_TYPE) && defined(INITIAL_VALUE)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/prior_box_layer.cl b/src/core/CL/cl_kernels/nchw/prior_box_layer.cl
index de10decdec..7524ba7b4a 100644
--- a/src/core/CL/cl_kernels/prior_box_layer.cl
+++ b/src/core/CL/cl_kernels/nchw/prior_box_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
diff --git a/src/core/CL/cl_kernels/nchw/remap.cl b/src/core/CL/cl_kernels/nchw/remap.cl
new file mode 100644
index 0000000000..fab88a1682
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/remap.cl
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "warp_helpers.h"
+
+#ifndef DEPTH_OUT
+/** Performs a remapping of an input image to an output given two remapping image using nearest neighbor as interpolation.
+ *
+ * This kernel performs remapping with this method of pixel coordinate translation:
+ *     out(x,y) = in(mapx(x,y), mapy(x,y));
+ *
+ * @param[in]  in_ptr                             Pointer to the source image. Supported data types: U8.
+ * @param[in]  in_stride_x                        Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                          in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  in_stride_y                        Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                          in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  in_offset_first_element_in_bytes   Offset of the first element in the source image
+ * @param[out] out_ptr                            Pointer to the destination image. Supported data types: U8.
+ * @param[in]  out_stride_x                       Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                         out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  out_stride_y                       Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                         out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  out_offset_first_element_in_bytes  Offset of the first element in the destination image
+ * @param[in]  mapx_ptr                           Pointer to the x remapping image. Supported data types: F32.
+ * @param[in]  mapx_stride_x                      Stride of the remapping image in X dimension (in bytes)
+ * @param[in]  mapx_step_x                        mapx_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  mapx_stride_y                      Stride of the remapping image in Y dimension (in bytes)
+ * @param[in]  mapx_step_y                        mapy_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in]  mapy_ptr                           Pointer to the x remapping image. Supported data types: F32.
+ * @param[in]  mapy_stride_x                      Stride of the remapping image in X dimension (in bytes)
+ * @param[in]  mapy_step_x                        mapy_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  mapy_stride_y                      Stride of the remapping image in Y dimension (in bytes)
+ * @param[in]  mapy_step_y                        mapy_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in]  width                              Width of the input image
+ * @param[in]  height                             Height of the input image
+ */
+__kernel void remap_nearest_neighbour_nchw(
+    IMAGE_DECLARATION(in),
+    IMAGE_DECLARATION(out),
+    IMAGE_DECLARATION(mapx),
+    IMAGE_DECLARATION(mapy),
+    const float width,
+    const float height)
+{
+    Image in   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+    Image out  = CONVERT_TO_IMAGE_STRUCT(out);
+    Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx);
+    Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy);
+
+    float4 mapx_coords = vload4(0, (__global float *)mapx.ptr);
+    float4 mapy_coords = vload4(0, (__global float *)mapy.ptr);
+    float8 map_coords  = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1,
+                                  mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3);
+
+    vstore4(read_texels4(&in, convert_int8(clamp_to_border(map_coords, width, height))), 0, out.ptr);
+}
+
+/** Performs a remapping of an input image to an output given two remapping image using bilinear as interpolation.
+ *
+ * This kernel performs remapping with this method of pixel coordinate translation:
+ *     out(x,y) = in(mapx(x,y), mapy(x,y));
+ *
+ * @param[in]  in_ptr                             Pointer to the source image. Supported data types: U8.
+ * @param[in]  in_stride_x                        Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                          in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  in_stride_y                        Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                          in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  in_offset_first_element_in_bytes   Offset of the first element in the source image
+ * @param[out] out_ptr                            Pointer to the destination image. Supported data types: U8.
+ * @param[in]  out_stride_x                       Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                         out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  out_stride_y                       Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                         out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  out_offset_first_element_in_bytes  Offset of the first element in the destination image
+ * @param[in]  mapx_ptr                           Pointer to the x remapping image. Supported data types: F32.
+ * @param[in]  mapx_stride_x                      Stride of the remapping image in X dimension (in bytes)
+ * @param[in]  mapx_step_x                        mapx_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  mapx_stride_y                      Stride of the remapping image in Y dimension (in bytes)
+ * @param[in]  mapx_step_y                        mapy_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in]  mapy_ptr                           Pointer to the x remapping image. Supported data types: F32.
+ * @param[in]  mapy_stride_x                      Stride of the remapping image in X dimension (in bytes)
+ * @param[in]  mapy_step_x                        mapy_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in]  mapy_stride_y                      Stride of the remapping image in Y dimension (in bytes)
+ * @param[in]  mapy_step_y                        mapy_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in]  mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in]  width                              Width of the input image
+ * @param[in]  height                             Height of the input image
+ */
+__kernel void remap_bilinear_nchw(
+    IMAGE_DECLARATION(in),
+    IMAGE_DECLARATION(out),
+    IMAGE_DECLARATION(mapx),
+    IMAGE_DECLARATION(mapy),
+    const float width,
+    const float height)
+{
+    Image in   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+    Image out  = CONVERT_TO_IMAGE_STRUCT(out);
+    Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx);
+    Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy);
+
+    float4 mapx_coords = vload4(0, (__global float *)mapx.ptr);
+    float4 mapy_coords = vload4(0, (__global float *)mapy.ptr);
+    float8 map_coords  = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1,
+                                  mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3);
+
+    vstore4(bilinear_interpolate(&in, clamp_to_border(map_coords, width, height), width, height), 0, out.ptr);
+}
+#endif // DEPTH_OUT
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/reorg_layer.cl b/src/core/CL/cl_kernels/nchw/reorg_layer.cl
index 29344de37a..f66b17c1a6 100644
--- a/src/core/CL/cl_kernels/reorg_layer.cl
+++ b/src/core/CL/cl_kernels/nchw/reorg_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -72,45 +72,4 @@ __kernel void reorg_layer_nchw(
     int src_offset                   = xi * sizeof(DATA_TYPE) + yi * src_stride_y + zi * src_stride_z;
     *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset));
 }
-
-/** Performs a reorganization layer of input tensor to the output tensor when the data layout is NHWC
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64
- * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: All
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void reorg_layer_nhwc(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
-{
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
-    int xo = get_global_id(1);
-    int yo = get_global_id(2);
-    int zo = get_global_id(0);
-    int xi, yi, zi;
-
-    CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi);
-
-    int src_offset = zi * sizeof(DATA_TYPE) + xi * src_stride_y + yi * src_stride_z;
-
-    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset));
-}
 #endif // // defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE)
 \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/scale.cl b/src/core/CL/cl_kernels/nchw/scale.cl
new file mode 100644
index 0000000000..63a53cc4f2
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/scale.cl
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "warp_helpers.h"
+
+/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
+ *
+ * @param[in] coord 2D coordinates to transform.
+ * @param[in] scale input/output scale ratio
+ *
+ * @return a float8 containing 4 2D transformed values in the input image.
+ */
+inline const float8 transform_nearest(const float2 coord, const float2 scale)
+{
+#ifdef SAMPLING_POLICY_TOP_LEFT
+    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+    const float4 new_x       = in_x_coords * (float4)(scale.s0);
+    const float4 new_y       = (float4)(coord.s1 * scale.s1);
+    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#elif SAMPLING_POLICY_CENTER
+    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+    const float4 new_x       = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0);
+    const float4 new_y       = (float4)((coord.s1 + 0.5f) * scale.s1);
+    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
+}
+
+/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
+ *
+ * @param[in] coord 2D coordinates to transform.
+ * @param[in] scale input/output scale ratio
+ *
+ * @return a float8 containing 4 2D transformed values in the input image.
+ */
+inline const float8 transform_bilinear(const float2 coord, const float2 scale)
+{
+    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+#ifdef SAMPLING_POLICY_TOP_LEFT
+    const float4 new_x = in_x_coords * (float4)(scale.s0);
+    const float4 new_y = (float4)(coord.s1 * scale.s1);
+    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#elif SAMPLING_POLICY_CENTER
+    const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
+    const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
+    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
+}
+
+/** Performs an affine transformation on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel U8 or S16.
+ *
+ * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ *
+ * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8, S16.
+ * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  input_width                       Input image width
+ * @param[in]  input_height                      Input image height
+ * @param[in]  scale_x                           The scale factor along x dimension
+ * @param[in]  scale_y                           The scale factor along y dimension
+ */
+__kernel void scale_nearest_neighbour_nchw(
+    IMAGE_DECLARATION(in),
+    IMAGE_DECLARATION(out),
+    const float input_width,
+    const float input_height,
+    const float scale_x,
+    const float scale_y)
+{
+    Image        in          = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+    Image        out         = CONVERT_TO_IMAGE_STRUCT(out);
+    const float2 r           = (float2)(scale_x, scale_y);
+    float8       transformed = transform_nearest(get_current_coords(), r);
+#ifdef ALIGN_CORNERS
+    transformed = round(transformed);
+#endif // ALIGN_CORNERS
+    const float8 tc = clamp_to_border_with_size(transformed, input_width, input_height, BORDER_SIZE);
+    vstore4(read_texels4(&in, convert_int8(tc)), 0, (__global DATA_TYPE *)out.ptr);
+}
+
+/** Performs an affine transformation on an image interpolating with the BILINEAR method.
+ *
+ * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ *
+ * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8, S16.
+ * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  input_width                       Input image width
+ * @param[in]  input_height                      Input image height
+ * @param[in]  scale_x                           The scale factor along x dimension
+ * @param[in]  scale_y                           The scale factor along y dimension
+ */
+__kernel void scale_bilinear_nchw(
+    IMAGE_DECLARATION(in),
+    IMAGE_DECLARATION(out),
+    const float input_width,
+    const float input_height,
+    const float scale_x,
+    const float scale_y)
+{
+    Image        in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+    Image        out = CONVERT_TO_IMAGE_STRUCT(out);
+    const float2 r   = (float2)(scale_x, scale_y);
+    const float8 tc  = transform_bilinear(get_current_coords(), r);
+    vstore4(bilinear_interpolate_with_border(&in, tc, input_width, input_height, BORDER_SIZE), 0, (__global DATA_TYPE *)out.ptr);
+}
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/scale_quantized.cl b/src/core/CL/cl_kernels/nchw/scale_quantized.cl
new file mode 100644
index 0000000000..946ad65c14
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/scale_quantized.cl
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers_asymm.h"
+#include "warp_helpers_quantized.h"
+
+/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
+ *
+ * @param[in] coord 2D coordinates to transform.
+ * @param[in] scale input/output scale ratio
+ *
+ * @return a float8 containing 4 2D transformed values in the input image.
+ */
+inline const float8 transform_bilinear_quantized(const float2 coord, const float2 scale)
+{
+    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+#ifdef SAMPLING_POLICY_TOP_LEFT
+    const float4 new_x = in_x_coords * (float4)(scale.s0);
+    const float4 new_y = (float4)(coord.s1 * scale.s1);
+    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#elif SAMPLING_POLICY_CENTER
+    const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
+    const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
+    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
+}
+
+/** Performs an affine transformation on an image interpolating with the BILINEAR method.
+ *
+ * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ * @note Scale value for QASYMM8 data type to used is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5
+ * @note Offset value for QASYMM8 data type to used is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1
+ *
+ * @param[in]  in_ptr                            Pointer to the source image. Supported data types: QASYMM8.
+ * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  in_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
+ * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in]  input_width                       Input image width
+ * @param[in]  input_height                      Input image height
+ * @param[in]  scale_x                           The scale factor along x dimension
+ * @param[in]  scale_y                           The scale factor along y dimension
+ */
+__kernel void scale_bilinear_quantized_nchw(
+    IMAGE_DECLARATION(in),
+    IMAGE_DECLARATION(out),
+    const float input_width,
+    const float input_height,
+    const float scale_x,
+    const float scale_y)
+{
+    Image        in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+    Image        out = CONVERT_TO_IMAGE_STRUCT(out);
+    const float2 r   = (float2)(scale_x, scale_y);
+    const float8 tc  = transform_bilinear_quantized(get_current_coords_quantized(), r);
+    vstore4(bilinear_interpolate_with_border_quantized(&in, tc, input_width, input_height, BORDER_SIZE, SCALE, OFFSET), 0, (__global DATA_TYPE *)out.ptr);
+}
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/space_to_batch.cl b/src/core/CL/cl_kernels/nchw/space_to_batch.cl
index cb11786ac4..e162a29bb0 100644
--- a/src/core/CL/cl_kernels/space_to_batch.cl
+++ b/src/core/CL/cl_kernels/nchw/space_to_batch.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -93,75 +93,7 @@ __kernel void space_to_batch_nchw(
         *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
     }
 }
-/** Calculate the space to batch conversion. (NHWC)
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The block shape tensor rank must be passed at compile time using -DBLOCK_SHAPE_DIM. e.g. -DBLOCK_SHAPE_DIM=2
- *
- * @param[in]  input_ptr                                 Pointer to the source tensor. Supported data types: All
- * @param[in]  input_stride_x                            Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                              input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                            Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                              input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                            Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                              input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes       The offset of the first element in the first source image
- * @param[in]  paddings_ptr                              Pointer to the second source image. Supported data types: S32
- * @param[in]  paddings_stride_x                         Stride of the paddinds tensor in X dimension (in bytes)
- * @param[in]  paddings_step_x                           paddings_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  paddings_stride_y                         Stride of the paddinds tensor in Y dimension (in bytes)
- * @param[in]  paddings_step_y                           paddings_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  paddingse_offset_first_element_in_bytes   The offset of the first element in the second source image
- * @param[in]  block_shape_ptr                           Pointer to the block shape tensor. Supported data types: S32
- * @param[in]  block_shape_stride_x                      Stride of the block shape tensor in X dimension (in bytes)
- * @param[in]  block_shape_step_x                        block_shape_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  block_shape_offset_first_element_in_bytes The offset of the first element in the block shapetensor
- * @param[in]  batch_id                                  The output tensor batch id
- * @param[out] output_ptr                                Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                           Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                             output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                           Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                             output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                           Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                             output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes      The offset of the first element in the destination image
- */
-__kernel void space_to_batch_nhwc(
-    TENSOR4D_DECLARATION(input),
-    IMAGE_DECLARATION(paddings),
-    VECTOR_DECLARATION(block_shape),
-    const int batch_id,
-    TENSOR3D_DECLARATION(output))
-{
-    Tensor4D in    = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
-    Image    pad   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(paddings);
-    Vector   block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
-    Tensor3D out   = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-    const int pad_left_x  = *((__global int *)offset(&pad, 0, 0));
-    const int pad_right_x = *((__global int *)offset(&pad, 1, 0));
-    const int pad_left_y  = *((__global int *)offset(&pad, 0, 1));
-    const int pad_right_y = *((__global int *)offset(&pad, 1, 1));
-
-    int block_x = *((__global int *)vector_offset(&block, 0));
-    int block_y = *((__global int *)vector_offset(&block, 1));
-
-    const int out_x = get_global_id(1);
-    const int out_y = get_global_id(2);
-    const int z     = get_global_id(0);
-
-    const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
-    const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
-
-    if(((pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) && (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN)))
-    {
-        const int w    = batch_id % BATCH_IN;
-        const int in_x = pos_x - pad_left_x;
-        const int in_y = pos_y - pad_left_y;
 
-        *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w));
-    }
-}
 #endif // defined(BATCH_SIZE) && defined(DATA_TYPE)  && defined(WIDTH_IN) && defined(HEIGHT_IN)
 
 #if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
@@ -221,60 +153,4 @@ __kernel void space_to_batch_static_nchw(
         *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
     }
 }
-/** Calculate the space to batch conversion. (NHWC)
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
- * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
- * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
- * @note The starting pad value of x must be passed at compile time using -DPAD_LEFT_X. e.g. -DPAD_LEFT_X=2
- * @note The ending pad value of x must be passed at compile time using -DPAD_RIGHT_X. e.g. -DPAD_RIGHT_X=2
- * @note The starting pad value of y must be passed at compile time using -DPAD_LEFT_Y. e.g. -DPAD_LEFT_Y=2
- * @note The ending pad value of  y must be passed at compile time using -DPAD_RIGHT_Y. e.g. -DPAD_RIGHT_X=2
- *
- * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: All
- * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source image
- * @param[in]  batch_id                             The output tensor batch id
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void space_to_batch_static_nhwc(
-    TENSOR4D_DECLARATION(input),
-    const int batch_id,
-    TENSOR3D_DECLARATION(output))
-{
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-    int block_x = BLOCK_SHAPE_X;
-    int block_y = BLOCK_SHAPE_Y;
-
-    const int out_x = get_global_id(1);
-    const int out_y = get_global_id(2);
-    const int z     = get_global_id(0);
-
-    const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
-    const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
-
-    if(pos_y >= PAD_LEFT_Y && pos_y < PAD_LEFT_Y + HEIGHT_IN && pos_x >= PAD_LEFT_X && pos_x < PAD_LEFT_X + WIDTH_IN)
-    {
-        const int w    = batch_id % BATCH_IN;
-        const int in_x = pos_x - PAD_LEFT_X;
-        const int in_y = pos_y - PAD_LEFT_Y;
-
-        *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w));
-    }
-}
 #endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y)  && defined(WIDTH_IN) && defined(HEIGHT_IN)
diff --git a/src/core/CL/cl_kernels/nchw/space_to_depth.cl b/src/core/CL/cl_kernels/nchw/space_to_depth.cl
new file mode 100644
index 0000000000..aea02e813b
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/space_to_depth.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+/** Space to depth transformation. (NCHW)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
+ * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: All
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[in]  batch_id                             The input tensor batch id
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void space_to_depth_nchw(
+    TENSOR4D_DECLARATION(input),
+    const int batch_id,
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2) % r;
+
+    const int in_x = x * BLOCK_SHAPE + (get_global_id(2) / r) % BLOCK_SHAPE;
+    const int in_y = y * BLOCK_SHAPE + (get_global_id(2) / r) / BLOCK_SHAPE;
+
+    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, batch_id));
+}
+#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/upsample_layer.cl b/src/core/CL/cl_kernels/nchw/upsample_layer.cl
new file mode 100644
index 0000000000..723c491165
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/upsample_layer.cl
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function applies upsample on an input image. (NCHW)
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE = Tensor data type. Supported data types: All
+ * -# -DVEC_SIZE_IN = Input vector size
+ * -# -DVEC_SIZE_OUT = Output vector size
+ * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to set this, might need to step back a bit)
+ * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to set this, might need to step back a bit)
+ *
+ * @param[in]  src_ptr                           Pointer to the source image. Supported data types: All
+ * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void upsample_layer_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#if defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi_in  = (int)(get_global_id(0) * VEC_SIZE_IN);
+    const int xi_out = (int)(get_global_id(0) * VEC_SIZE_OUT);
+    src.ptr -= max(xi_in - (int)LAST_ACCESSED_X_IN, 0) * src_stride_x;
+    dst.ptr -= max(xi_out - (int)LAST_ACCESSED_X_OUT, 0) * dst_stride_x;
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    data = vload8(0, (__global DATA_TYPE *)src.ptr);
+
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    data_out = (VEC_DATA_TYPE(DATA_TYPE, 16))(data.s0, data.s0, data.s1, data.s1, data.s2, data.s2, data.s3, data.s3, data.s4, data.s4, data.s5, data.s5, data.s6, data.s6, data.s7, data.s7);
+
+    vstore16(data_out, 0, (__global DATA_TYPE *)dst.ptr);
+    vstore16(data_out, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0));
+#else  // !defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+    *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0)) = *((__global DATA_TYPE *)src.ptr);
+    *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0)) = *((__global DATA_TYPE *)src.ptr);
+#endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+}
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl b/src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl
new file mode 100644
index 0000000000..85eff9e6d9
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl
@@ -0,0 +1,911 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(SRC_DIM_Z)
+/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 2x2/2x1/1x2
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_2x2_3x3_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+    // Load the values from the input tensor
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+                                       *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+                                       *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
+#else  // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+    // Row 0
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out0    = 0.0f;
+    out0.s0 = (w0.s0);
+    out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f;
+    out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f;
+    out0.s3 = (w0.s2);
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    // Row 1
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out1    = 0.0f;
+    out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f;
+    out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f;
+    out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f;
+    out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f;
+
+    // Row 2
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out2    = 0.0f;
+    out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f;
+    out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f;
+    out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f;
+    out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f;
+
+    // Row 3
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out3    = 0.0f;
+    out3.s0 = (w2.s0);
+    out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f;
+    out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f;
+    out3.s3 = (w2.s2);
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+    int z  = get_global_id(2);
+    int x0 = z / SRC_DIM_Z; // idx filter
+    int y0 = z % SRC_DIM_Z; // idx channel
+
+    // Get output address
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
+
+    // Store the values across the channels
+    // 16 channels for 3x3 kernels
+    // 4 channels for 3x1 or 1x3 kernels
+    *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+    *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+    *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+    *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)  = out1.s0;
+    *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)  = out1.s1;
+    *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)  = out1.s2;
+    *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)  = out1.s3;
+    *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)  = out2.s0;
+    *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)  = out2.s1;
+    *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out2.s2;
+    *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out2.s3;
+    *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out3.s0;
+    *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out3.s1;
+    *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out3.s2;
+    *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out3.s3;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 4x4/4x1/1x4
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x4_3x3_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+    // Load the values from the input tensor
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+                                       *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+                                       *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
+#else  // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+    // Row 0
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out0    = 0.0f;
+    out0.s0 = (w0.s0) / 16.f;
+    out0.s1 = (-w0.s0 - w0.s1 - w0.s2) / 24.f;
+    out0.s2 = (-w0.s0 + w0.s1 - w0.s2) / 24.f;
+    out0.s3 = (w0.s0 + 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
+    out0.s4 = (w0.s0 - 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
+    out0.s5 = (w0.s2) / 4.f;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    // Row 1
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out1    = 0.0f;
+    out1.s0 = (-w0.s0 - w1.s0 - w2.s0) / 24.f;
+    out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
+    out1.s2 = (w0.s0 + w1.s0 + w2.s0 - w0.s1 - w1.s1 - w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
+    out1.s3 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (-w0.s1 - w1.s1 - w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
+    out1.s4 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (w0.s1 + w1.s1 + w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
+    out1.s5 = (-w0.s2 - w1.s2 - w2.s2) / 6.f;
+
+    // Row 2
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out2    = 0.0f;
+    out2.s0 = (-w0.s0 + w1.s0 - w2.s0) / 24.f;
+    out2.s1 = (w0.s0 - w1.s0 + w2.s0 + w0.s1 - w1.s1 + w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
+    out2.s2 = (w0.s0 - w1.s0 + w2.s0 - w0.s1 + w1.s1 - w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
+    out2.s3 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (-w0.s1 + w1.s1 - w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
+    out2.s4 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (w0.s1 - w1.s1 + w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
+    out2.s5 = (-w0.s2 + w1.s2 - w2.s2) / 6.f;
+
+    // Row 3
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out3    = 0.0f;
+    out3.s0 = (w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
+    out3.s1 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 - 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+    out3.s2 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 + 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+    out3.s3 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 + 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+    out3.s4 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 - 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+    out3.s5 = (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
+
+    // Row 4
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out4    = 0.0f;
+    out4.s0 = (w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
+    out4.s1 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 + 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+    out4.s2 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 - 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+    out4.s3 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 - 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+    out4.s4 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 + 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+    out4.s5 = (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
+
+    // Row 5
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out5    = 0.0f;
+    out5.s0 = (w2.s0) / 4.f;
+    out5.s1 = (-w2.s0 - w2.s1 - w2.s2) / 6.f;
+    out5.s2 = (-w2.s0 + w2.s1 - w2.s2) / 6.f;
+    out5.s3 = (w2.s0 + 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
+    out5.s4 = (w2.s0 - 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
+    out5.s5 = (w2.s2);
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+    int z  = get_global_id(2);
+    int x0 = z / SRC_DIM_Z; // idx filter
+    int y0 = z % SRC_DIM_Z; // idx channel
+
+    // Get output address
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
+
+    // Store the values across the channels
+    // 36 channels for 3x3 kernels
+    // 6 channels for 3x1 or 1x3 kernels
+    *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+    *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+    *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+    *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+    *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+    *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)  = out1.s0;
+    *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)  = out1.s1;
+    *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)  = out1.s2;
+    *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)  = out1.s3;
+    *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s4;
+    *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s5;
+    *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out2.s0;
+    *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out2.s1;
+    *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out2.s2;
+    *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out2.s3;
+    *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s4;
+    *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s5;
+    *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out3.s0;
+    *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out3.s1;
+    *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out3.s2;
+    *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out3.s3;
+    *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out3.s4;
+    *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out3.s5;
+    *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out4.s0;
+    *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out4.s1;
+    *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out4.s2;
+    *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out4.s3;
+    *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out4.s4;
+    *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out4.s5;
+    *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out5.s0;
+    *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out5.s1;
+    *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out5.s2;
+    *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out5.s3;
+    *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out5.s4;
+    *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out5.s5;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel performs Winograd filter transform 5x5/5x1 or 1x5 when the data layout is NCHW and the output tile is 4x4/4x1 or 1x4
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ *
+ * @note If this kernel is used to perform Winograd filter transform 5x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x5, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x4_5x5_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+    // Load the values from the input tensor
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    w00           = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
+#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    w00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+                                        *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+                                        *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+                                        *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+#else  // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    w00           = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    w10           = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y) + 4);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    w20           = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+    DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y) + 4);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    w30           = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+    DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y) + 4);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    w40           = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+    DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y) + 4);
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+    // Transform the input tile
+
+    // Row 0
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out0    = 0.0f;
+    out0.s0 = w00.s0;
+    out0.s1 = -2.f * (w00.s0 + w00.s1 + w00.s2 + w00.s3 + w01) / 9.f;
+    out0.s2 = -2.f * (w00.s0 - w00.s1 + w00.s2 - w00.s3 + w01) / 9.f;
+    out0.s3 = (w00.s0 + 2.f * w00.s1 + 4.f * w00.s2 + 8.f * w00.s3 + 16.f * w01) / 90.f;
+    out0.s4 = (w00.s0 - 2.f * w00.s1 + 4.f * w00.s2 - 8.f * w00.s3 + 16.f * w01) / 90.f;
+    out0.s5 = (16.f * w00.s0 + 8.f * w00.s1 + 4.f * w00.s2 + 2.f * w00.s3 + w01) / 180.f;
+    out0.s6 = (16.f * w00.s0 - 8.f * w00.s1 + 4.f * w00.s2 - 2.f * w00.s3 + w01) / 180.f;
+    out0.s7 = w01;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    // Row 1
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out1    = 0.0f;
+    out1.s0 = -2.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) / 9.f;
+    out1.s1 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) +
+                     (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
+    out1.s2 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) -
+                     (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
+    out1.s3 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 8.f *
+                (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
+    out1.s4 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 8.f *
+                (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
+    out1.s5 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 2.f *
+                (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
+    out1.s6 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 2.f *
+                (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
+    out1.s7 = -2.f * (w01 + w11 + w21 + w31 + w41) / 9.f;
+
+    // Row 2
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out2    = 0.0f;
+    out2.s0 = -2.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) / 9.f;
+    out2.s1 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) +
+                     (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
+    out2.s2 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) -
+                     (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
+    out2.s3 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 8.f *
+                (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
+    out2.s4 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 8.f *
+                (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
+    out2.s5 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 2.f *
+                (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
+    out2.s6 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 2.f *
+                (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
+    out2.s7 = -2.f * (w01 - w11 + w21 - w31 + w41) / 9.f;
+
+    // Row 3
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out3    = 0.0f;
+    out3.s0 = (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
+    out3.s1 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
+                (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+                (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
+    out3.s2 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
+                (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+                (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
+    out3.s3 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+               (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
+    out3.s4 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+               (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
+    out3.s5 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+               (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
+    out3.s6 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+               (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
+    out3.s7 = (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) / 90.f;
+
+    // Row 4
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out4    = 0.0f;
+    out4.s0 = (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
+    out4.s1 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
+                (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+                (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
+    out4.s2 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
+                (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+                (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
+    out4.s3 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+               (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
+    out4.s4 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+               (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
+    out4.s5 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+               (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
+    out4.s6 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+               (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
+    out4.s7 = (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) / 90.f;
+
+    // Row 5
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out5    = 0.0f;
+    out5.s0 = (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) / 180.f;
+    out5.s1 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
+                (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+                (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
+    out5.s2 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
+                (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+                (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
+    out5.s3 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
+               (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
+    out5.s4 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
+               (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
+    out5.s5 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+               (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
+    out5.s6 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+               (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
+    out5.s7 = (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) / 180.f;
+
+    // Row 6
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out6    = 0.0f;
+    out6.s0 = (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) / 180.f;
+    out6.s1 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
+                (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+                (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
+    out6.s2 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
+                (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+                (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
+    out6.s3 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
+               (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
+    out6.s4 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
+               (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
+    out6.s5 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+               (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
+    out6.s6 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+               (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+               (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
+    out6.s7 = (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) / 180.f;
+
+    // Row 7
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out7    = 0.0f;
+    out7.s0 = w40.s0;
+    out7.s1 = -2.f * (w40.s0 + w40.s1 + w40.s2 + w40.s3 + w41) / 9.f;
+    out7.s2 = -2.f * (w40.s0 - w40.s1 + w40.s2 - w40.s3 + w41) / 9.f;
+    out7.s3 = (w40.s0 + 2.f * w40.s1 + 4.f * w40.s2 + 8.f * w40.s3 + 16.f * w41) / 90.f;
+    out7.s4 = (w40.s0 - 2.f * w40.s1 + 4.f * w40.s2 - 8.f * w40.s3 + 16.f * w41) / 90.f;
+    out7.s5 = (16.f * w40.s0 + 8.f * w40.s1 + 4.f * w40.s2 + 2.f * w40.s3 + w41) / 180.f;
+    out7.s6 = (16.f * w40.s0 - 8.f * w40.s1 + 4.f * w40.s2 - 2.f * w40.s3 + w41) / 180.f;
+    out7.s7 = w41;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+    int z  = get_global_id(2);
+    int x0 = z / SRC_DIM_Z; // idx filter
+    int y0 = z % SRC_DIM_Z; // idx channel
+
+    // Get output address
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
+
+    // Store the values across the channels
+    *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+    *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+    *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+    *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+    *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+    *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+    *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;
+    *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)  = out1.s0;
+    *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)  = out1.s1;
+    *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;
+    *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;
+    *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;
+    *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;
+    *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;
+    *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;
+    *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;
+    *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;
+    *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;
+    *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;
+    *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;
+    *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;
+    *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;
+    *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;
+    *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;
+    *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;
+    *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;
+    *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;
+    *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;
+    *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;
+    *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;
+    *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;
+    *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;
+    *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;
+    *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;
+    *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;
+    *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;
+    *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;
+    *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;
+    *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;
+    *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;
+    *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;
+    *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;
+    *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;
+    *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;
+    *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;
+    *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;
+    *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;
+    *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;
+    *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;
+    *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;
+    *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;
+    *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;
+    *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;
+    *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;
+    *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;
+    *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;
+    *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;
+    *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;
+    *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;
+    *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;
+    *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;
+    *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;
+    *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+
+#endif // defined(SRC_DIM_Z)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 2x1
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_2x1_3x1_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    winograd_filter_transform_2x2_3x3_nchw(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 4x1
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x1_3x1_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    winograd_filter_transform_4x4_3x3_nchw(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 5x1 when the data layout is NCHW and the output tile is 4x1
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x1_5x1_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    winograd_filter_transform_4x4_5x5_nchw(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_offset_first_element_in_bytes);
+}
+
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x2
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_1x2_1x3_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    winograd_filter_transform_2x2_3x3_nchw(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x4
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_1x4_1x3_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    winograd_filter_transform_4x4_3x3_nchw(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 1x5 when the data layout is NCHW and the output tile is 1x4
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_1x4_1x5_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    winograd_filter_transform_4x4_5x5_nchw(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_offset_first_element_in_bytes);
+}
+
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
diff --git a/src/core/CL/cl_kernels/winograd_input_transform.cl b/src/core/CL/cl_kernels/nchw/winograd_input_transform.cl
index fbb5e95196..8c382183c3 100644
--- a/src/core/CL/cl_kernels/winograd_input_transform.cl
+++ b/src/core/CL/cl_kernels/nchw/winograd_input_transform.cl
@@ -908,893 +908,6 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nchw(
 #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
 }
 
-#if defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y)
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
-    TENSOR4D(src, BUFFER),
-    TENSOR4D(dst, BUFFER))
-{
-    const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
-    const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
-    const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
-
-    // All the tensor dimensions are passed at compile time.
-    // In case of dynamic tensor support, the following dimensions should be passed as function argument.
-#define _ISRC_WIDTH SRC_WIDTH
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _INUM_TILES_X NUM_TILES_X
-#define _INUM_TILES_Y NUM_TILES_Y
-
-    int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
-    int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
-    x -= PAD_LEFT;
-    y -= PAD_TOP;
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
-    TILE(DATA_TYPE, 6, 1, in);
-    TILE(DATA_TYPE, 6, 1, out);
-
-    // Initialize the input tile
-    LOOP_UNROLLING(int, i, 0, 1, 6,
-    {
-        in[i].v = 0;
-    })
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-    T_LOAD_NHWC(DATA_TYPE, 1, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-#else  // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-    T_LOAD_NHWC(DATA_TYPE, 6, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-
-    TILE(DATA_TYPE, 6, 1, com);
-
-    LOOP_UNROLLING(int, i, 0, 1, 6,
-    {
-        in[i].v *= 4.0f;
-    })
-
-    com[0].v = in[2].v - 4.f * in[0].v;
-    com[1].v = in[3].v - 4.f * in[1].v;
-    com[2].v = in[4].v - 4.f * in[2].v;
-    com[3].v = in[5].v - 4.f * in[3].v;
-    com[4].v = in[3].v - in[1].v;
-    com[4].v = com[4].v + com[4].v;
-    com[5].v = in[4].v - in[2].v;
-
-    out[0].v = com[2].v - com[0].v;
-    out[1].v = com[2].v + com[1].v;
-    out[2].v = com[2].v - com[1].v;
-    out[3].v = com[5].v + com[4].v;
-    out[4].v = com[5].v - com[4].v;
-    out[5].v = com[3].v - com[1].v;
-
-    TILE(uint, 6, 1, dst_indirect_y);
-
-    LOOP_UNROLLING(int, i, 0, 1, 6,
-    {
-        dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
-        dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 6;
-    })
-
-    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 6, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#else  // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
-    TILE(DATA_TYPE, 36, 1, in);
-
-    // Initialize the input tile
-    LOOP_UNROLLING(int, i, 0, 1, 36,
-    {
-        in[i].v = 0;
-    })
-
-    // Load the tile from a NHWC tensor
-    T_LOAD_NHWC(DATA_TYPE, 6, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-
-    TILE(DATA_TYPE, 6, 1, com);
-    TILE(DATA_TYPE, 36, 1, tmp);
-
-    LOOP_UNROLLING(int, i, 0, 1, 6,
-    {
-        com[0].v         = in[2 * 6 + i].v - (DATA_TYPE)4.0f * in[0 * 6 + i].v;
-        com[1].v         = in[3 * 6 + i].v - (DATA_TYPE)4.0f * in[1 * 6 + i].v;
-        com[2].v         = in[4 * 6 + i].v - (DATA_TYPE)4.0f * in[2 * 6 + i].v;
-        com[3].v         = in[5 * 6 + i].v - (DATA_TYPE)4.0f * in[3 * 6 + i].v;
-        com[4].v         = in[3 * 6 + i].v - in[1 * 6 + i].v;
-        com[4].v         = com[4].v + com[4].v;
-        com[5].v         = in[4 * 6 + i].v - in[2 * 6 + i].v;
-        tmp[i + 0 * 6].v = com[2].v - com[0].v;
-        tmp[i + 1 * 6].v = com[2].v + com[1].v;
-        tmp[i + 2 * 6].v = com[2].v - com[1].v;
-        tmp[i + 3 * 6].v = com[5].v + com[4].v;
-        tmp[i + 4 * 6].v = com[5].v - com[4].v;
-        tmp[i + 5 * 6].v = com[3].v - com[1].v;
-    })
-
-    TILE(DATA_TYPE, 36, 1, out);
-
-    LOOP_UNROLLING(int, i, 0, 1, 6,
-    {
-        com[0].v         = tmp[i * 6 + 2].v - 4.f * tmp[i * 6 + 0].v;
-        com[1].v         = tmp[i * 6 + 3].v - 4.f * tmp[i * 6 + 1].v;
-        com[2].v         = tmp[i * 6 + 4].v - 4.f * tmp[i * 6 + 2].v;
-        com[3].v         = tmp[i * 6 + 5].v - 4.f * tmp[i * 6 + 3].v;
-        com[4].v         = tmp[i * 6 + 3].v - tmp[i * 6 + 1].v;
-        com[4].v         = com[4].v + com[4].v;
-        com[5].v         = tmp[i * 6 + 4].v - tmp[i * 6 + 2].v;
-        out[i * 6 + 0].v = com[2].v - com[0].v;
-        out[i * 6 + 1].v = com[2].v + com[1].v;
-        out[i * 6 + 2].v = com[2].v - com[1].v;
-        out[i * 6 + 3].v = com[5].v + com[4].v;
-        out[i * 6 + 4].v = com[5].v - com[4].v;
-        out[i * 6 + 5].v = com[3].v - com[1].v;
-    })
-
-    // Compute destination address
-    TILE(uint, 36, 1, dst_indirect_y);
-
-    LOOP_UNROLLING(int, i, 0, 1, 36,
-    {
-        dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
-        dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 36;
-    })
-
-    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 36, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-}
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
-    TENSOR4D(src, BUFFER),
-    TENSOR4D(dst, BUFFER))
-{
-    const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
-    const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
-    const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
-
-    // All the tensor dimensions are passed at compile time.
-    // In case of dynamic tensor support, the following dimensions should be passed as function argument.
-#define _ISRC_WIDTH SRC_WIDTH
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _INUM_TILES_X NUM_TILES_X
-#define _INUM_TILES_Y NUM_TILES_Y
-
-    int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
-    int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
-    x -= PAD_LEFT;
-    y -= PAD_TOP;
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
-    TILE(DATA_TYPE, 8, 1, in);
-    TILE(DATA_TYPE, 8, 1, out);
-
-    // Initialize the input tile
-    LOOP_UNROLLING(int, i, 0, 1, 8,
-    {
-        in[i].v = 0;
-    })
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-    T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-#else  // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-    T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-
-    TILE(DATA_TYPE, 1, 8, com);
-
-    com[0].s[0] = in[2].v - 4.25f * in[4].v + in[6].v;
-    com[0].s[1] = in[1].v - 4.25f * in[3].v + in[5].v;
-    com[0].s[2] = 0.5f * in[1].v - 2.5f * in[3].v + 2.0f * in[5].v;
-    com[0].s[3] = 0.25f * in[2].v - 1.25f * in[4].v + in[6].v;
-    com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v;
-    com[0].s[5] = 2.0f * in[1].v - 2.5f * in[3].v + 0.5f * in[5].v;
-    out[0].s[0] = in[0].v - 5.25f * in[2].v + 5.25f * in[4].v - in[6].v;
-    out[1].s[0] = com[0].s[0] + com[0].s[1];
-    out[2].s[0] = com[0].s[0] - com[0].s[1];
-    out[3].s[0] = com[0].s[3] + com[0].s[2];
-    out[4].s[0] = com[0].s[3] - com[0].s[2];
-    out[5].s[0] = com[0].s[4] + com[0].s[5];
-    out[6].s[0] = com[0].s[4] - com[0].s[5];
-    out[7].s[0] = -in[1].v + 5.25f * in[3].v - 5.25f * in[5].v + in[7].v;
-
-    TILE(uint, 8, 1, dst_indirect_y);
-
-    LOOP_UNROLLING(int, i, 0, 1, 8,
-    {
-        dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
-        dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 8;
-    })
-
-    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
-    TILE(DATA_TYPE, 64, 1, in);
-    TILE(DATA_TYPE, 64, 1, out);
-
-    // Initialize the input tile
-    LOOP_UNROLLING(int, i, 0, 1, 64,
-    {
-        in[i].v = 0;
-    })
-
-    // Load the tile from a NHWC tensor
-    T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-
-    TILE(DATA_TYPE, 8, 8, com);
-
-    LOOP_UNROLLING(int, i, 0, 1, 8,
-    {
-        com[0].s[i] = in[2 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];                                    // x
-        com[1].s[i] = in[1 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0];                                    // x
-        com[2].s[i] = (DATA_TYPE)0.25f * in[2 * 8 + i].s[0] - (DATA_TYPE)1.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];                 // x
-        com[3].s[i] = (DATA_TYPE)0.5f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0]; // x
-        com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
-        com[5].s[i] = (DATA_TYPE)2.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)0.5f * in[5 * 8 + i].s[0];
-        com[6].s[i] = in[0 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[2 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[4 * 8 + i].s[0] - in[6 * 8 + i].s[0];
-        com[7].s[i] = -in[1 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[3 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[5 * 8 + i].s[0] + in[7 * 8 + i].s[0];
-    })
-
-    TILE(DATA_TYPE, 8, 8, tmp);
-    tmp[0].v = com[6].v;
-    tmp[1].v = com[0].v + com[1].v;
-    tmp[2].v = com[0].v - com[1].v;
-    tmp[3].v = com[2].v + com[3].v;
-    tmp[4].v = com[2].v - com[3].v;
-    tmp[5].v = com[4].v + com[5].v;
-    tmp[6].v = com[4].v - com[5].v;
-    tmp[7].v = com[7].v;
-
-    LOOP_UNROLLING(int, i, 0, 1, 8,
-    {
-        com[0].s[0]         = tmp[i].s[2] - 4.25f * tmp[i].s[4] + tmp[i].s[6];
-        com[0].s[1]         = tmp[i].s[1] - 4.25f * tmp[i].s[3] + tmp[i].s[5];
-        com[0].s[2]         = 0.5f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 2.0f * tmp[i].s[5];
-        com[0].s[3]         = 0.25f * tmp[i].s[2] - 1.25f * tmp[i].s[4] + tmp[i].s[6];
-        com[0].s[4]         = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6];
-        com[0].s[5]         = 2.0f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 0.5f * tmp[i].s[5];
-        out[i * 8 + 0].s[0] = tmp[i].s[0] - 5.25f * tmp[i].s[2] + 5.25f * tmp[i].s[4] - tmp[i].s[6];
-        out[i * 8 + 1].s[0] = com[0].s[0] + com[0].s[1];
-        out[i * 8 + 2].s[0] = com[0].s[0] - com[0].s[1];
-        out[i * 8 + 3].s[0] = com[0].s[3] + com[0].s[2];
-        out[i * 8 + 4].s[0] = com[0].s[3] - com[0].s[2];
-        out[i * 8 + 5].s[0] = com[0].s[4] + com[0].s[5];
-        out[i * 8 + 6].s[0] = com[0].s[4] - com[0].s[5];
-        out[i * 8 + 7].s[0] = -tmp[i].s[1] + 5.25f * tmp[i].s[3] - 5.25f * tmp[i].s[5] + tmp[i].s[7];
-    })
-
-    TILE(uint, 64, 1, dst_indirect_y);
-
-    LOOP_UNROLLING(int, i, 0, 1, 64,
-    {
-        dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
-        dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 64;
-    })
-
-    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-}
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 7x7/7x1/1x7 and the output tile is 2x2/7x1/1x7 when the data layout is NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(
-    TENSOR4D(src, BUFFER),
-    TENSOR4D(dst, BUFFER))
-{
-    const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
-    const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
-    const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
-
-    // All the tensor dimensions are passed at compile time.
-    // In case of dynamic tensor support, the following dimensions should be passed as function argument.
-#define _ISRC_WIDTH SRC_WIDTH
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _INUM_TILES_X NUM_TILES_X
-#define _INUM_TILES_Y NUM_TILES_Y
-
-    int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
-    int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
-    x -= PAD_LEFT;
-    y -= PAD_TOP;
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
-    TILE(DATA_TYPE, 8, 1, in);
-    TILE(DATA_TYPE, 8, 1, out);
-
-    // Initialize the input tile
-    LOOP_UNROLLING(int, i, 0, 1, 8,
-    {
-        in[i].v = 0;
-    })
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-    T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-#else  // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-    T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-
-    LOOP_UNROLLING(int, i, 0, 1, 8,
-    {
-        in[i].v *= (DATA_TYPE) - 36.0f;
-    })
-
-    TILE(DATA_TYPE, 1, 8, com) = { { { 0 } } };
-
-    com[0].s[0] = 36.0f * in[2].v - 13.0f * in[4].v + in[6].v;
-    com[0].s[1] = 36.0f * in[1].v - 13.0f * in[3].v + 1.0f * in[5].v;
-    com[0].s[2] = 9.0f * in[2].v - 10.0f * in[4].v + in[6].v;
-    com[0].s[3] = 18.0f * in[1].v - 20.0f * in[3].v + 2.0f * in[5].v;
-    com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v;
-    com[0].s[5] = 12.0f * in[1].v - 15.0f * in[3].v + 3.0f * in[5].v;
-    out[0].s[0] = -36.0f * in[0].v + 49.0f * in[2].v + -14.0f * in[4].v + in[6].v;
-    out[1].s[0] = com[0].s[0] - com[0].s[1];
-    out[2].s[0] = com[0].s[0] + com[0].s[1];
-    out[3].s[0] = com[0].s[2] - com[0].s[3];
-    out[4].s[0] = com[0].s[2] + com[0].s[3];
-    out[5].s[0] = com[0].s[4] - com[0].s[5];
-    out[6].s[0] = com[0].s[4] + com[0].s[5];
-    out[7].s[0] = -36.0f * in[1].v + 0.0f * in[2].v + 49.0f * in[3].v - 14.0f * in[5].v + in[7].v;
-
-    TILE(uint, 8, 1, dst_indirect_y);
-
-    LOOP_UNROLLING(int, i, 0, 1, 8,
-    {
-        dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
-        dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 8;
-    })
-
-    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
-    TILE(DATA_TYPE, 64, 1, in);
-    TILE(DATA_TYPE, 64, 1, out);
-
-    // Initialize the input tile
-    LOOP_UNROLLING(int, i, 0, 1, 64,
-    {
-        in[i].v = 0;
-    })
-
-    // Load the tile from a NHWC tensor
-    T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-
-    TILE(DATA_TYPE, 8, 8, com);
-
-    LOOP_UNROLLING(int, i, 0, 1, 8,
-    {
-        com[0].s[i] = (DATA_TYPE)36.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
-        com[1].s[i] = (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0];
-        com[2].s[i] = (DATA_TYPE)9.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)10.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
-        com[3].s[i] = (DATA_TYPE)18.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)20.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0];
-        com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
-        com[5].s[i] = (DATA_TYPE)12.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)15.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)3.0f * in[5 * 8 + i].s[0];
-        com[6].s[i] = (DATA_TYPE)49.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[0 * 8 + i].s[0] + in[6 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[4 * 8 + i].s[0];
-        com[7].s[i] = (DATA_TYPE)49.0f * in[3 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] + in[7 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[5 * 8 + i].s[0];
-    })
-
-    TILE(DATA_TYPE, 8, 8, tmp);
-    tmp[0].v = com[6].v;
-    tmp[1].v = com[0].v - com[1].v;
-    tmp[2].v = com[0].v + com[1].v;
-    tmp[3].v = com[2].v - com[3].v;
-    tmp[4].v = com[2].v + com[3].v;
-    tmp[5].v = com[4].v - com[5].v;
-    tmp[6].v = com[4].v + com[5].v;
-    tmp[7].v = com[7].v;
-
-    LOOP_UNROLLING(int, i, 0, 1, 8,
-    {
-        com[0].s[0]         = 36.0f * tmp[i].s[2] - 13.0f * tmp[i].s[4] + tmp[i].s[6];
-        com[0].s[1]         = 36.0f * tmp[i].s[1] - 13.0f * tmp[i].s[3] + 1.0f * tmp[i].s[5];
-        com[0].s[2]         = 9.0f * tmp[i].s[2] - 10.0f * tmp[i].s[4] + tmp[i].s[6];
-        com[0].s[3]         = 18.0f * tmp[i].s[1] - 20.0f * tmp[i].s[3] + 2.0f * tmp[i].s[5];
-        com[0].s[4]         = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6];
-        com[0].s[5]         = 12.0f * tmp[i].s[1] - 15.0f * tmp[i].s[3] + 3.0f * tmp[i].s[5];
-        out[i * 8 + 0].s[0] = -36.0f * tmp[i].s[0] + 49.0f * tmp[i].s[2] + -14.0f * tmp[i].s[4] + tmp[i].s[6];
-        out[i * 8 + 1].s[0] = com[0].s[0] - com[0].s[1];
-        out[i * 8 + 2].s[0] = com[0].s[0] + com[0].s[1];
-        out[i * 8 + 3].s[0] = com[0].s[2] - com[0].s[3];
-        out[i * 8 + 4].s[0] = com[0].s[2] + com[0].s[3];
-        out[i * 8 + 5].s[0] = com[0].s[4] - com[0].s[5];
-        out[i * 8 + 6].s[0] = com[0].s[4] + com[0].s[5];
-        out[i * 8 + 7].s[0] = -36.0f * tmp[i].s[1] + 0.0f * tmp[i].s[2] + 49.0f * tmp[i].s[3] - 14.0f * tmp[i].s[5] + tmp[i].s[7];
-    })
-
-    TILE(uint, 64, 1, dst_indirect_y);
-
-    LOOP_UNROLLING(int, i, 0, 1, 64,
-    {
-        dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
-        dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 64;
-    })
-
-    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-}
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(
-    TENSOR4D(src, BUFFER),
-    TENSOR4D(dst, BUFFER))
-{
-    winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
-                                                 src_stride_x,
-                                                 src_step_x,
-                                                 src_stride_y,
-                                                 src_step_y,
-                                                 src_stride_z,
-                                                 src_step_z,
-                                                 src_stride_w,
-                                                 src_step_w,
-                                                 src_offset_first_element_in_bytes,
-                                                 dst_ptr,
-                                                 dst_stride_x,
-                                                 dst_step_x,
-                                                 dst_stride_y,
-                                                 dst_step_y,
-                                                 dst_stride_z,
-                                                 dst_step_z,
-                                                 dst_stride_w,
-                                                 dst_step_w,
-                                                 dst_offset_first_element_in_bytes);
-}
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(
-    TENSOR4D(src, BUFFER),
-    TENSOR4D(dst, BUFFER))
-{
-    winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
-                                                 src_stride_x,
-                                                 src_step_x,
-                                                 src_stride_y,
-                                                 src_step_y,
-                                                 src_stride_z,
-                                                 src_step_z,
-                                                 src_stride_w,
-                                                 src_step_w,
-                                                 src_offset_first_element_in_bytes,
-                                                 dst_ptr,
-                                                 dst_stride_x,
-                                                 dst_step_x,
-                                                 dst_stride_y,
-                                                 dst_step_y,
-                                                 dst_stride_z,
-                                                 dst_step_z,
-                                                 dst_stride_w,
-                                                 dst_step_w,
-                                                 dst_offset_first_element_in_bytes);
-}
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc(
-    TENSOR4D(src, BUFFER),
-    TENSOR4D(dst, BUFFER))
-{
-    winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
-                                                 src_stride_x,
-                                                 src_step_x,
-                                                 src_stride_y,
-                                                 src_step_y,
-                                                 src_stride_z,
-                                                 src_step_z,
-                                                 src_stride_w,
-                                                 src_step_w,
-                                                 src_offset_first_element_in_bytes,
-                                                 dst_ptr,
-                                                 dst_stride_x,
-                                                 dst_step_x,
-                                                 dst_stride_y,
-                                                 dst_step_y,
-                                                 dst_stride_z,
-                                                 dst_step_z,
-                                                 dst_stride_w,
-                                                 dst_step_w,
-                                                 dst_offset_first_element_in_bytes);
-}
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc(
-    TENSOR4D(src, BUFFER),
-    TENSOR4D(dst, BUFFER))
-{
-    winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
-                                                 src_stride_x,
-                                                 src_step_x,
-                                                 src_stride_y,
-                                                 src_step_y,
-                                                 src_stride_z,
-                                                 src_step_z,
-                                                 src_stride_w,
-                                                 src_step_w,
-                                                 src_offset_first_element_in_bytes,
-                                                 dst_ptr,
-                                                 dst_stride_x,
-                                                 dst_step_x,
-                                                 dst_stride_y,
-                                                 dst_step_y,
-                                                 dst_stride_z,
-                                                 dst_step_z,
-                                                 dst_stride_w,
-                                                 dst_step_w,
-                                                 dst_offset_first_element_in_bytes);
-}
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc(
-    TENSOR4D(src, BUFFER),
-    TENSOR4D(dst, BUFFER))
-{
-    winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
-                                                 src_stride_x,
-                                                 src_step_x,
-                                                 src_stride_y,
-                                                 src_step_y,
-                                                 src_stride_z,
-                                                 src_step_z,
-                                                 src_stride_w,
-                                                 src_step_w,
-                                                 src_offset_first_element_in_bytes,
-                                                 dst_ptr,
-                                                 dst_stride_x,
-                                                 dst_step_x,
-                                                 dst_stride_y,
-                                                 dst_step_y,
-                                                 dst_stride_z,
-                                                 dst_step_z,
-                                                 dst_stride_w,
-                                                 dst_step_w,
-                                                 dst_offset_first_element_in_bytes);
-}
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc(
-    TENSOR4D(src, BUFFER),
-    TENSOR4D(dst, BUFFER))
-{
-    winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
-                                                 src_stride_x,
-                                                 src_step_x,
-                                                 src_stride_y,
-                                                 src_step_y,
-                                                 src_stride_z,
-                                                 src_step_z,
-                                                 src_stride_w,
-                                                 src_step_w,
-                                                 src_offset_first_element_in_bytes,
-                                                 dst_ptr,
-                                                 dst_stride_x,
-                                                 dst_step_x,
-                                                 dst_stride_y,
-                                                 dst_step_y,
-                                                 dst_stride_z,
-                                                 dst_step_z,
-                                                 dst_stride_w,
-                                                 dst_step_w,
-                                                 dst_offset_first_element_in_bytes);
-}
-#endif // defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y)
-
 #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
 /** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 2x1
  *
diff --git a/src/core/CL/cl_kernels/winograd_output_transform.cl b/src/core/CL/cl_kernels/nchw/winograd_output_transform.cl
index 6a3e6d3346..861ed50651 100644
--- a/src/core/CL/cl_kernels/winograd_output_transform.cl
+++ b/src/core/CL/cl_kernels/nchw/winograd_output_transform.cl
@@ -176,181 +176,6 @@ __kernel void winograd_output_transform_2x2_3x3_nchw(
             (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
 #endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 }
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_2x2_7x7_nhwc(
-    TENSOR4D(src, BUFFER),
-    TENSOR4D(dst, BUFFER),
-#if defined(HAS_BIAS)
-    VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
-    int dst_size)
-{
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _IDST_WIDTH DST_WIDTH
-#define _IDST_HEIGHT DST_HEIGHT
-#define _INUM_TILES_X NUM_TILES_X
-
-    const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
-    const int mout = GET_SPATIAL_IDX(1, 1, 0);  // WINOGRAD OUTPUT TILES
-    const int bout = GET_SPATIAL_IDX(2, 1, 0);  // BATCH SIZE IDX
-
-    int x_out = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
-    int y_out = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    TILE(DATA_TYPE, 8, N0, in);
-    TILE(DATA_TYPE, 2, N0, out);
-    TILE(uint, 8, 1, src_indirect_y);
-
-    // Calculate the indirect Y for the source tensor
-    LOOP_UNROLLING(int, i, 0, 1, 8,
-    {
-        src_indirect_y[i].v = mout + i * _ISRC_HEIGHT;
-        src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 8);
-    })
-
-    // Initialize the input tile
-    LOOP_UNROLLING(int, i, 0, 1, 8,
-    {
-        in[i].v = 0;
-    })
-
-    // Load the values across the 8 channels to compose the 8x1 tile
-    T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
-
-    // Compute out0 and out01
-    out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v + in[5].v + in[6].v;
-    out[1].v = -in[1].v + in[2].v - 2.f * in[3].v + 2.0f * in[4].v - 3.0f * in[5].v + 3.0f * in[6].v + in[7].v;
-
-#if defined(HAS_BIAS)
-    // Add bias
-    TILE(DATA_TYPE, 1, N0, b);
-
-    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
-
-    T_ADD_BROADCAST_X(DATA_TYPE, 2, N0, out, b, out);
-#endif // defined(HAS_BIAS)
-
-    T_ACTIVATION(DATA_TYPE, 2, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
-
-    TILE(uint, 2, 1, dst_indirect_y);
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    LOOP_UNROLLING(int, yk, 0, 1, 2,
-    {
-        int y_c              = min(y_out + yk, ((int)_IDST_HEIGHT - 1));
-        dst_indirect_y[yk].v = x_out + y_c * (int)(_IDST_WIDTH);
-    })
-#else  // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    LOOP_UNROLLING(int, xk, 0, 1, 2,
-    {
-        int x_c              = min(x_out + xk, ((int)_IDST_WIDTH - 1));
-        dst_indirect_y[xk].v = x_c + y_out * (int)(_IDST_WIDTH);
-    })
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
-    // Store the tile in reverse order so the invalid values are overwritten with the valid ones
-    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 2, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
-    TILE(DATA_TYPE, 64, N0, in);
-    TILE(DATA_TYPE, 4, N0, out);
-    TILE(DATA_TYPE, 16, N0, tmp);
-    TILE(uint, 64, 1, src_indirect_y);
-
-    // Calculate the indirect Y for the source tensor
-    LOOP_UNROLLING(int, i, 0, 1, 64,
-    {
-        src_indirect_y[i].v = mout + i * _ISRC_HEIGHT;
-        src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 64);
-    })
-
-    // Initialize the input tile
-    LOOP_UNROLLING(int, i, 0, 1, 64,
-    {
-        in[i].v = 0;
-    })
-
-    // Load the values across the 64 channels to compose the 8x8 tile
-    T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
-
-    LOOP_UNROLLING(int, i, 0, 1, 8,
-    {
-        tmp[i * 2].v     = in[0 + i].v + in[8 + i].v + in[16 + i].v + in[24 + i].v + in[32 + i].v + in[40 + i].v + in[48 + i].v;
-        tmp[i * 2 + 1].v = -in[8 + i].v + in[16 + i].v - 2 * in[24 + i].v + 2 * in[32 + i].v + -3 * in[40 + i].v + 3 * in[48 + i].v + in[56 + i].v;
-    })
-
-    // Compute the 2x2 output tile
-    LOOP_UNROLLING(int, i, 0, 1, 2,
-    {
-        out[i * 2].v     = tmp[0 + i].v + tmp[2 + i].v + tmp[4 + i].v + tmp[6 + i].v + tmp[8 + i].v + tmp[10 + i].v + tmp[12 + i].v;
-        out[i * 2 + 1].v = -tmp[2 + i].v + tmp[4 + i].v - 2 * tmp[6 + i].v + 2 * tmp[8 + i].v - 3 * tmp[10 + i].v + 3 * tmp[12 + i].v + tmp[14 + i].v;
-    })
-
-#if defined(HAS_BIAS)
-    // Add bias
-    TILE(DATA_TYPE, 1, N0, b);
-
-    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
-
-    T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
-#endif // defined(HAS_BIAS)
-
-    T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
-
-    TILE(uint, 4, 1, dst_indirect_y);
-
-    // Calculate the destination indirect Y
-    LOOP_UNROLLING(int, yk, 0, 1, 2,
-    {
-        LOOP_UNROLLING(int, xk, 0, 1, 2,
-        {
-            int x_c                       = min(x_out + xk, ((int)_IDST_WIDTH - 1));
-            int y_c                       = min(y_out + yk, ((int)_IDST_HEIGHT - 1));
-            dst_indirect_y[xk + yk * 2].v = x_c + y_c * _IDST_WIDTH;
-            dst_indirect_y[xk + yk * 2].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
-        })
-    })
-
-    // Store the tile in reverse order so the invalid values are overwritten with the valid ones
-    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-}
 #endif // defined(VEC_SIZE) && VEC_SIZE == 2
 
 #if defined(VEC_SIZE) && VEC_SIZE == 4
@@ -577,200 +402,6 @@ __kernel void winograd_output_transform_4x4_3x3_nchw(
 #endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 }
 
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  dst_size                          Size of the destination tensor, minus the last padding
- */
-__kernel void winograd_output_transform_4x4_3x3_nhwc(
-    TENSOR4D(src, BUFFER),
-    TENSOR4D(dst, BUFFER),
-#if defined(HAS_BIAS)
-    VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
-    int dst_size)
-{
-    const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
-    const int mout = GET_SPATIAL_IDX(1, 1, 0);  // WINOGRAD OUTPUT TILES
-    const int bout = GET_SPATIAL_IDX(2, 1, 0);  // BATCH SIZE IDX
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
-    TILE(DATA_TYPE, 6, N0, in);
-    TILE(DATA_TYPE, 4, N0, out);
-    TILE(uint, 6, 1, src_indirect_y);
-
-    LOOP_UNROLLING(int, i, 0, 1, 6,
-    {
-        src_indirect_y[i].v = mout + i * SRC_HEIGHT;
-        src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 6);
-    })
-
-    // Initialize the input tile
-    LOOP_UNROLLING(int, i, 0, 1, 6,
-    {
-        in[i].v = 0;
-    })
-
-    // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
-    T_LOAD_INDIRECT(DATA_TYPE, 6, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
-
-    // Compute out00, out01, out02 and out03
-    out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v;
-    out[1].v = in[1].v - in[2].v + 2.0f * in[3].v - 2.0f * in[4].v;
-    out[2].v = in[1].v + in[2].v + 4.0f * in[3].v + 4.0f * in[4].v;
-    out[3].v = in[1].v - in[2].v + 8.0f * in[3].v - 8.0f * in[4].v + in[5].v;
-
-#if defined(HAS_BIAS)
-    TILE(DATA_TYPE, 1, N0, b);
-
-    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
-
-    // c = c + bias[broadcasted]
-    T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
-#endif // HAS_BIAS
-
-    int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
-    int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
-
-    T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
-
-    TILE(uint, 4, 1, dst_indirect_y);
-
-    // Calculate the destination indirect Y
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    LOOP_UNROLLING(int, yk, 0, 1, 4,
-    {
-        int y_c              = min(y_out + yk, ((int)DST_HEIGHT - 1));
-        dst_indirect_y[yk].v = x_out + y_c * DST_WIDTH;
-        dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
-    })
-#else  // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    LOOP_UNROLLING(int, xk, 0, 1, 4,
-    {
-        int x_c              = min(x_out + xk, ((int)DST_WIDTH - 1));
-        dst_indirect_y[xk].v = x_c + y_out * DST_WIDTH;
-        dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
-    })
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
-    // Store the tile in reverse order so the invalid values are overwritten with the valid ones
-    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
-    // Calculate the indirect Y for the source tensor
-    TILE(DATA_TYPE, 36, N0, in);
-    TILE(DATA_TYPE, 4, N0, tmp);
-    TILE(uint, 36, 1, src_indirect_y);
-
-    LOOP_UNROLLING(int, i, 0, 1, 36,
-    {
-        src_indirect_y[i].v = mout + i * SRC_HEIGHT;
-        src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 36);
-    })
-
-    // Initialize the input tile
-    LOOP_UNROLLING(int, i, 0, 1, 36,
-    {
-        in[i].v = 0;
-    })
-
-    // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
-    T_LOAD_INDIRECT(DATA_TYPE, 36, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
-
-    LOOP_UNROLLING(int, i, 0, 1, 6,
-    {
-        tmp[0].v     = in[6 + i].v + in[12 + i].v;
-        tmp[1].v     = in[6 + i].v - in[12 + i].v;
-        tmp[2].v     = in[18 + i].v + in[24 + i].v;
-        tmp[3].v     = in[18 + i].v - in[24 + i].v;
-        tmp[3].v     = tmp[3].v + tmp[3].v;
-        in[i].v      = in[i].v + tmp[0].v + tmp[2].v;
-        in[6 + i].v  = tmp[3].v + tmp[1].v;
-        in[12 + i].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
-        in[18 + i].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[30 + i].v;
-    })
-
-    // Compute the output tile
-    TILE(DATA_TYPE, 16, N0, out);
-
-    LOOP_UNROLLING(int, i, 0, 1, 4,
-    {
-        tmp[0].v         = in[6 * i + 1].v + in[6 * i + 2].v;
-        tmp[1].v         = in[6 * i + 1].v - in[6 * i + 2].v;
-        tmp[2].v         = in[6 * i + 3].v + in[6 * i + 4].v;
-        tmp[3].v         = in[6 * i + 3].v - in[6 * i + 4].v;
-        tmp[3].v         = tmp[3].v + tmp[3].v;
-        out[4 * i + 0].v = in[6 * i + 0].v + tmp[0].v + tmp[2].v;
-        out[4 * i + 1].v = tmp[3].v + tmp[1].v;
-        out[4 * i + 2].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
-        out[4 * i + 3].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[6 * i + 5].v;
-    })
-
-#if defined(HAS_BIAS)
-    TILE(DATA_TYPE, 1, N0, b);
-
-    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
-
-    // c = c + bias[broadcasted]
-    T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);
-#endif // HAS_BIAS
-
-    int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
-    int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
-
-    T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
-
-    TILE(uint, 16, 1, dst_indirect_y);
-
-    // Calculate the destination indirect Y
-    LOOP_UNROLLING(int, yk, 0, 1, 4,
-    {
-        LOOP_UNROLLING(int, xk, 0, 1, 4,
-        {
-            int x_c                       = min(x_out + xk, ((int)DST_WIDTH - 1));
-            int y_c                       = min(y_out + yk, ((int)DST_HEIGHT - 1));
-            dst_indirect_y[xk + yk * 4].v = x_c + y_c * DST_WIDTH;
-            dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
-        })
-    })
-
-    // Store the tile in reverse order so the invalid values are overwritten with the valid ones
-    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-}
-
 #define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact)  \
     ({                                                                   \
         comm_fact.s0 = d1 + d2;                                          \
@@ -1023,214 +654,6 @@ __kernel void winograd_output_transform_4x4_5x5_nchw(
             0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
 #endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 }
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_4x4_5x5_nhwc(
-    TENSOR4D(src, BUFFER),
-    TENSOR4D(dst, BUFFER),
-#if defined(HAS_BIAS)
-    VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
-    int dst_size)
-{
-    const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
-    const int mout = GET_SPATIAL_IDX(1, 1, 0);  // WINOGRAD OUTPUT TILES
-    const int bout = GET_SPATIAL_IDX(2, 1, 0);  // BATCH SIZE IDX
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    TILE(DATA_TYPE, 8, N0, in);
-    TILE(DATA_TYPE, 4, N0, out);
-    TILE(DATA_TYPE, 4, N0, tmp);
-    TILE(uint, 8, 1, src_indirect_y);
-
-    LOOP_UNROLLING(int, i, 0, 1, 8,
-    {
-        src_indirect_y[i].v = mout + i * SRC_HEIGHT;
-        src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 8);
-    })
-
-    // Initialize the input tile
-    LOOP_UNROLLING(int, i, 0, 1, 8,
-    {
-        in[i].v = 0;
-    })
-
-    // "in" contains 1x8 or 8x1 tile here
-    T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
-
-    // A^T * in, and in this degenerate case out consists of 1 column/row
-    tmp[0].v = in[1].v - in[2].v;
-    tmp[1].v = 2.0f * (in[3].v - in[4].v);
-    tmp[2].v = 2.0f * (in[5].v + in[6].v);
-    tmp[3].v = in[3].v + in[4].v;
-    out[0].v = in[0].v + in[1].v + in[2].v + tmp[3].v + 4.0f * tmp[2].v;
-    out[1].v = tmp[0].v + tmp[1].v + 4.0f * (in[5].v - in[6].v);
-    out[2].v = in[1].v + in[2].v + 4.0f * tmp[3].v + tmp[2].v;
-    out[3].v = tmp[0].v + 4.0f * tmp[1].v + in[5].v - in[6].v + in[7].v;
-
-#if defined(HAS_BIAS)
-    TILE(DATA_TYPE, 1, N0, b);
-
-    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
-
-    // c = c + bias[broadcasted]
-    T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
-#endif // HAS_BIAS
-
-    int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
-    int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
-
-    T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
-
-    TILE(uint, 4, 1, dst_indirect_y);
-
-    // Calculate the destination indirect Y
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    LOOP_UNROLLING(int, yk, 0, 1, 4,
-    {
-        int y_c              = min(y_out + yk, ((int)DST_HEIGHT - 1));
-        dst_indirect_y[yk].v = x_out + y_c * DST_WIDTH;
-        dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
-    })
-#else  // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    LOOP_UNROLLING(int, xk, 0, 1, 4,
-    {
-        int x_c              = min(x_out + xk, ((int)DST_WIDTH - 1));
-        dst_indirect_y[xk].v = x_c + y_out * DST_WIDTH;
-        dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
-    })
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
-    // Store the tile in reverse order so the invalid values are overwritten with the valid ones
-    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    // Calculate the indirect Y for the source tensor
-    TILE(DATA_TYPE, 64, N0, in);
-    TILE(DATA_TYPE, 6, N0, tmp);
-    TILE(uint, 64, 1, src_indirect_y);
-
-    LOOP_UNROLLING(int, i, 0, 1, 64,
-    {
-        src_indirect_y[i].v = mout + i * SRC_HEIGHT;
-        src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 64);
-    })
-
-    // Initialize the input tile
-    LOOP_UNROLLING(int, i, 0, 1, 64,
-    {
-        in[i].v = 0;
-    })
-
-    // "in" here is 8x8 tile
-    T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
-
-    // A^T * in
-    LOOP_UNROLLING(int, i, 0, 1, 8,
-    {
-        tmp[0].v = in[8 + i].v + in[16 + i].v;
-        tmp[1].v = in[8 + i].v - in[16 + i].v;
-        tmp[2].v = in[24 + i].v + in[32 + i].v;
-        tmp[3].v = in[24 + i].v - in[32 + i].v;
-        tmp[3].v = tmp[3].v + tmp[3].v;
-        tmp[4].v = in[40 + i].v + in[48 + i].v;
-        tmp[4].v = tmp[4].v + tmp[4].v;
-        tmp[5].v = in[40 + i].v - in[48 + i].v;
-
-        // 4x8 matrix as a result
-        in[i].v      = in[i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);
-        in[8 + i].v  = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);
-        in[16 + i].v = tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[4].v);
-        in[24 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[5].v) + in[56 + i].v;
-    })
-
-    // Compute the output tile
-    TILE(DATA_TYPE, 16, N0, out);
-
-    // in * A, with in = A^T * in as above
-    LOOP_UNROLLING(int, i, 0, 1, 4,
-    {
-        tmp[0].v = in[8 * i + 1].v + in[8 * i + 2].v;
-        tmp[1].v = in[8 * i + 1].v - in[8 * i + 2].v;
-        tmp[2].v = in[8 * i + 3].v + in[8 * i + 4].v;
-        tmp[3].v = in[8 * i + 3].v - in[8 * i + 4].v;
-        tmp[3].v = tmp[3].v + tmp[3].v;
-        tmp[4].v = in[8 * i + 5].v + in[8 * i + 6].v;
-        tmp[4].v = tmp[4].v + tmp[4].v;
-        tmp[5].v = in[8 * i + 5].v - in[8 * i + 6].v;
-
-        // 4x4 tile
-        out[4 * i].v     = in[8 * i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);
-        out[4 * i + 1].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);
-        out[4 * i + 2].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[0].v) + tmp[4].v;
-        out[4 * i + 3].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[1].v) + tmp[5].v + in[8 * i + 7].v;
-    })
-
-#if defined(HAS_BIAS)
-    TILE(DATA_TYPE, 1, N0, b);
-
-    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
-
-    // c = c + bias[broadcasted]
-    T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);
-#endif // HAS_BIAS
-
-    int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
-    int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
-
-    T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
-
-    TILE(uint, 16, 1, dst_indirect_y);
-
-    // Calculate the destination indirect Y
-    LOOP_UNROLLING(int, yk, 0, 1, 4,
-    {
-        LOOP_UNROLLING(int, xk, 0, 1, 4,
-        {
-            int x_c                       = min(x_out + xk, ((int)DST_WIDTH - 1));
-            int y_c                       = min(y_out + yk, ((int)DST_HEIGHT - 1));
-            dst_indirect_y[xk + yk * 4].v = x_c + y_c * DST_WIDTH;
-            dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
-        })
-    })
-
-    // Store the tile in reverse order so the invalid values are overwritten with the valid ones
-    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-}
 #endif // defined(VEC_SIZE) && VEC_SIZE == 4
 
 #if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
@@ -1303,73 +726,6 @@ __kernel void winograd_output_transform_2x1_3x1_nchw(
                                           );
 }
 
-/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_2x1_7x1_nhwc(
-    TENSOR4D_DECLARATION(src),
-    TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
-    VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
-    int dst_size)
-{
-    winograd_output_transform_2x2_7x7_nhwc(src_ptr,
-                                           src_stride_x,
-                                           src_step_x,
-                                           src_stride_y,
-                                           src_step_y,
-                                           src_stride_z,
-                                           src_step_z,
-                                           src_stride_w,
-                                           src_step_w,
-                                           src_offset_first_element_in_bytes,
-                                           dst_ptr,
-                                           dst_stride_x,
-                                           dst_step_x,
-                                           dst_stride_y,
-                                           dst_step_y,
-                                           dst_stride_z,
-                                           dst_step_z,
-                                           dst_stride_w,
-                                           dst_step_w,
-                                           dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
-                                           bias_ptr,
-                                           bias_stride_x,
-                                           bias_step_x,
-                                           bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
-                                           dst_size);
-}
 #endif // defined(VEC_SIZE) && VEC_SIZE == 2
 
 #if defined(VEC_SIZE) && VEC_SIZE == 4
@@ -1509,141 +865,6 @@ __kernel void winograd_output_transform_4x1_5x1_nchw(
                                           );
 }
 
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_4x1_3x1_nhwc(
-    TENSOR4D_DECLARATION(src),
-    TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
-    VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
-    int dst_size)
-{
-    winograd_output_transform_4x4_3x3_nhwc(src_ptr,
-                                           src_stride_x,
-                                           src_step_x,
-                                           src_stride_y,
-                                           src_step_y,
-                                           src_stride_z,
-                                           src_step_z,
-                                           src_stride_w,
-                                           src_step_w,
-                                           src_offset_first_element_in_bytes,
-                                           dst_ptr,
-                                           dst_stride_x,
-                                           dst_step_x,
-                                           dst_stride_y,
-                                           dst_step_y,
-                                           dst_stride_z,
-                                           dst_step_z,
-                                           dst_stride_w,
-                                           dst_step_w,
-                                           dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
-                                           bias_ptr,
-                                           bias_stride_x,
-                                           bias_step_x,
-                                           bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
-                                           dst_size);
-}
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_4x1_5x1_nhwc(
-    TENSOR4D_DECLARATION(src),
-    TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
-    VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
-    int dst_size)
-{
-    winograd_output_transform_4x4_5x5_nhwc(src_ptr,
-                                           src_stride_x,
-                                           src_step_x,
-                                           src_stride_y,
-                                           src_step_y,
-                                           src_stride_z,
-                                           src_step_z,
-                                           src_stride_w,
-                                           src_step_w,
-                                           src_offset_first_element_in_bytes,
-                                           dst_ptr,
-                                           dst_stride_x,
-                                           dst_step_x,
-                                           dst_stride_y,
-                                           dst_step_y,
-                                           dst_stride_z,
-                                           dst_step_z,
-                                           dst_stride_w,
-                                           dst_step_w,
-                                           dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
-                                           bias_ptr,
-                                           bias_stride_x,
-                                           bias_step_x,
-                                           bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
-                                           dst_size);
-}
 #endif // defined(VEC_SIZE) && VEC_SIZE == 4
 #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
 
@@ -1717,73 +938,6 @@ __kernel void winograd_output_transform_1x2_1x3_nchw(
                                           );
 }
 
-/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_1x2_1x7_nhwc(
-    TENSOR4D_DECLARATION(src),
-    TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
-    VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
-    int dst_size)
-{
-    winograd_output_transform_2x2_7x7_nhwc(src_ptr,
-                                           src_stride_x,
-                                           src_step_x,
-                                           src_stride_y,
-                                           src_step_y,
-                                           src_stride_z,
-                                           src_step_z,
-                                           src_stride_w,
-                                           src_step_w,
-                                           src_offset_first_element_in_bytes,
-                                           dst_ptr,
-                                           dst_stride_x,
-                                           dst_step_x,
-                                           dst_stride_y,
-                                           dst_step_y,
-                                           dst_stride_z,
-                                           dst_step_z,
-                                           dst_stride_w,
-                                           dst_step_w,
-                                           dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
-                                           bias_ptr,
-                                           bias_stride_x,
-                                           bias_step_x,
-                                           bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
-                                           dst_size);
-}
 #endif // defined(VEC_SIZE) && VEC_SIZE == 2
 
 #if defined(VEC_SIZE) && VEC_SIZE == 4
@@ -1923,141 +1077,6 @@ __kernel void winograd_output_transform_1x4_1x5_nchw(
                                           );
 }
 
-/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_1x4_1x3_nhwc(
-    TENSOR4D_DECLARATION(src),
-    TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
-    VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
-    int dst_size)
-{
-    winograd_output_transform_4x4_3x3_nhwc(src_ptr,
-                                           src_stride_x,
-                                           src_step_x,
-                                           src_stride_y,
-                                           src_step_y,
-                                           src_stride_z,
-                                           src_step_z,
-                                           src_stride_w,
-                                           src_step_w,
-                                           src_offset_first_element_in_bytes,
-                                           dst_ptr,
-                                           dst_stride_x,
-                                           dst_step_x,
-                                           dst_stride_y,
-                                           dst_step_y,
-                                           dst_stride_z,
-                                           dst_step_z,
-                                           dst_stride_w,
-                                           dst_step_w,
-                                           dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
-                                           bias_ptr,
-                                           bias_stride_x,
-                                           bias_step_x,
-                                           bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
-                                           dst_size);
-}
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_1x4_1x5_nhwc(
-    TENSOR4D_DECLARATION(src),
-    TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
-    VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
-    int dst_size)
-{
-    winograd_output_transform_4x4_5x5_nhwc(src_ptr,
-                                           src_stride_x,
-                                           src_step_x,
-                                           src_stride_y,
-                                           src_step_y,
-                                           src_stride_z,
-                                           src_step_z,
-                                           src_stride_w,
-                                           src_step_w,
-                                           src_offset_first_element_in_bytes,
-                                           dst_ptr,
-                                           dst_stride_x,
-                                           dst_step_x,
-                                           dst_stride_y,
-                                           dst_step_y,
-                                           dst_stride_z,
-                                           dst_step_z,
-                                           dst_stride_w,
-                                           dst_step_w,
-                                           dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
-                                           bias_ptr,
-                                           bias_stride_x,
-                                           bias_step_x,
-                                           bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
-                                           dst_size);
-}
 #endif // defined(VEC_SIZE) && VEC_SIZE == 4
 #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
 #endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
diff --git a/src/core/CL/cl_kernels/space_to_depth.cl b/src/core/CL/cl_kernels/nhwc/batch_to_space.cl
index 1217a37345..a5334525fe 100644
--- a/src/core/CL/cl_kernels/space_to_depth.cl
+++ b/src/core/CL/cl_kernels/nhwc/batch_to_space.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,12 +23,12 @@
  */
 #include "helpers.h"
 
-#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
-/** Space to depth transformation. (NCHW)
+#if defined(DATA_TYPE) && defined(BATCH_SIZE)
+/** Batch to space transformation. (NHWC)
  *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
- * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
  *
  * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: All
  * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
@@ -39,6 +39,12 @@
  * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
  * @param[in]  batch_id                             The input tensor batch id
+ * @param[in]  block_shape_ptr                      Pointer to the source tensor. Supported data types: S32
+ * @param[in]  block_shape_stride_x                 Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  block_shape_step_x                   block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  block_shape_stride_y                 Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  block_shape_step_y                   block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
  * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
  * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
  * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
@@ -48,29 +54,39 @@
  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
-__kernel void space_to_depth_nchw(
-    TENSOR4D_DECLARATION(input),
+__kernel void batch_to_space_nhwc(
+    TENSOR3D_DECLARATION(input),
     const int batch_id,
-    TENSOR3D_DECLARATION(output))
+    VECTOR_DECLARATION(block_shape),
+    TENSOR4D_DECLARATION(output))
 {
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+    Tensor3D in    = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor4D out   = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+    Vector   block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
 
-    const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-    const int z = get_global_id(2) % r;
+    const int block_x = *((__global int *)vector_offset(&block, 0));
+    const int block_y = *((__global int *)vector_offset(&block, 1));
 
-    const int in_x = x * BLOCK_SHAPE + (get_global_id(2) / r) % BLOCK_SHAPE;
-    const int in_y = y * BLOCK_SHAPE + (get_global_id(2) / r) / BLOCK_SHAPE;
+    const int r = (BATCH_SIZE / (block_x * block_y));
+    const int x = get_global_id(1);
+    const int y = get_global_id(2);
+    const int z = get_global_id(0);
+    const int w = batch_id % r;
 
-    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, batch_id));
+    const int out_x = x * block_x + (batch_id / r) % block_x;
+    const int out_y = y * block_y + (batch_id / r) / block_x;
+
+    *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, w)) = *((__global DATA_TYPE *)in.ptr);
 }
-/** Space to depth transformation. (NHWC)
+#endif // defined(DATA_TYPE) && defined(BATCH_SIZE)
+
+#if defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
+/** Batch to space transformation. (NHWC)
  *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
- * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
  *
  * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: All
  * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
@@ -90,22 +106,26 @@ __kernel void space_to_depth_nchw(
  * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
  * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
  */
-__kernel void space_to_depth_nhwc(
-    TENSOR4D_DECLARATION(input),
+__kernel void batch_to_space_static_nhwc(
+    TENSOR3D_DECLARATION(input),
     const int batch_id,
-    TENSOR3D_DECLARATION(output))
+    TENSOR4D_DECLARATION(output))
 {
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+    const int block_x = BLOCK_SHAPE_X;
+    const int block_y = BLOCK_SHAPE_Y;
 
-    const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+    const int r = (BATCH_SIZE / (block_x * block_y));
     const int x = get_global_id(1);
     const int y = get_global_id(2);
-    const int z = get_global_id(0) % r;
+    const int z = get_global_id(0);
+    const int w = batch_id % r;
 
-    const int in_x = x * BLOCK_SHAPE + (get_global_id(0) / r) % BLOCK_SHAPE;
-    const int in_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE;
+    const int out_x = x * block_x + (batch_id / r) % block_x;
+    const int out_y = y * block_y + (batch_id / r) / block_x;
 
-    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, batch_id));
+    *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, w)) = *((__global DATA_TYPE *)in.ptr);
 }
-#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
-\ No newline at end of file
+#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl b/src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl
new file mode 100644
index 0000000000..cb2da1bd99
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define INVSQRT_OP(a) rsqrt((a))
+#define SQCVT_SAT(a) (a)
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE)
+#include "activation_float_helpers.h"
+
+/** Apply batch normalization on tensors with NHWC format.
+ *
+ * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ *
+ * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  mean_ptr                             Pointer to the mean source tensor. Supported data types: same as @p input_ptr
+ * @param[in]  mean_stride_x                        Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in]  mean_step_x                          mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mean_offset_first_element_in_bytes   The offset of the first element in the mean source tensor
+ * @param[in]  var_ptr                              Pointer to the var tensor. Supported data types: same as @p input_ptr
+ * @param[in]  var_stride_x                         Stride of the var tensor in X dimension (in bytes)
+ * @param[in]  var_step_x                           var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  var_offset_first_element_in_bytes    The offset of the first element in the var source tensor
+ * @param[in]  beta_ptr                             Pointer to the beta source tensor. Supported data types: same as @p input_ptr
+ * @param[in]  beta_stride_x                        Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in]  beta_step_x                          beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  beta_offset_first_element_in_bytes   The offset of the first element in the beta source tensor
+ * @param[in]  gamma_ptr                            Pointer to the gamma source tensor. Supported data types: same as @p input_ptr
+ * @param[in]  gamma_stride_x                       Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in]  gamma_step_x                         gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  gamma_offset_first_element_in_bytes  The offset of the first element in the gamma source tensor
+ * @param[in]  epsilon                              Epsilon parameter in the batch normalization equation
+ */
+__kernel void batchnormalization_layer_nhwc(TENSOR3D_DECLARATION(input),
+#ifndef IN_PLACE
+                                            TENSOR3D_DECLARATION(output),
+#endif /* not IN_PLACE */
+                                            VECTOR_DECLARATION(mean),
+                                            VECTOR_DECLARATION(var),
+#ifndef USE_DEFAULT_BETA
+                                            VECTOR_DECLARATION(beta),
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+                                            VECTOR_DECLARATION(gamma),
+#endif /* USE_DEFAULT_GAMMA */
+                                            float epsilon)
+{
+    uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
+
+    __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
+#ifdef IN_PLACE
+    __global uchar *output_addr = input_ptr;
+#else  /* IN_PLACE */
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
+#endif /* IN_PLACE */
+    __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs;
+    __global uchar *var_addr  = var_ptr + var_offset_first_element_in_bytes + x_offs;
+#ifndef USE_DEFAULT_BETA
+    __global uchar *beta_addr = beta_ptr + beta_offset_first_element_in_bytes + x_offs;
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+    __global uchar *gamma_addr = gamma_ptr + gamma_offset_first_element_in_bytes + x_offs;
+#endif /* USE_DEFAULT_GAMMA */
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    denominator = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    numerator = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    x_bar = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    res0 = 0;
+
+    data        = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
+    denominator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)var_addr);
+    denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
+
+    // Calculate x bar and store results
+    numerator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr);
+    numerator = SUB_OP(data, numerator);
+    x_bar     = MUL_OP(numerator, denominator);
+
+#ifndef USE_DEFAULT_GAMMA
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    gamma_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)gamma_addr);
+
+    res0 = MUL_OP(gamma_vec, x_bar);
+#else  /* USE_DEFAULT_GAMMA */
+    // gamma is equal to 1, no need to perform multiplications
+    res0 = x_bar;
+#endif /* USE_DEFAULT_GAMMA */
+
+#ifndef USE_DEFAULT_BETA
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    beta_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)beta_addr);
+    // beta is not zero, hence we need to perform the addition
+    res0 = ADD_OP(res0, beta_vec);
+#endif /* USE_DEFAULT_BETA */
+
+    res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res0, A_VAL, B_VAL);
+
+    STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE)*/
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/channel_shuffle.cl b/src/core/CL/cl_kernels/nhwc/channel_shuffle.cl
index 63af2c6137..233beb3aa9 100644
--- a/src/core/CL/cl_kernels/channel_shuffle.cl
+++ b/src/core/CL/cl_kernels/nhwc/channel_shuffle.cl
@@ -38,68 +38,6 @@
         mod_res = (x)-r;                                    \
     })
 
-/** Performs channel shuffle when the data layout is NCHW. See https://arxiv.org/pdf/1707.01083.pdf for details.
- *
- * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
- * @note The depth of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
- * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
- * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
- *       K is equal to num_channels / num_groups.
- *
- * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: All
- * @param[in]  src_stride_x                      Stride of the first source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the first source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the first source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the first source tensor in Z dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_w                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_w                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void channel_shuffle_nchw(TENSOR4D_DECLARATION(src),
-                                   TENSOR4D_DECLARATION(dst))
-{
-    uint curr_channel = 0; // channel id of input
-    uint batch_id     = 0; // batch id
-    uint group_id     = 0; // group id
-    uint channel_id   = 0; // channel id within the group
-
-    // Compute curr_channel and batch_id
-    DIV_MOD_UINT(get_global_id(2), SRC_DIM_Z, batch_id, curr_channel);
-
-    // Compute group_id and channel_id
-    DIV_MOD_UINT(curr_channel, K, group_id, channel_id);
-
-    const uint x = get_global_id(0) * VEC_SIZE;
-    const uint y = get_global_id(1) * 2;
-    const uint z = channel_id * NUM_GROUPS + group_id;
-
-    // Load the Nx2 block
-    const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + curr_channel * src_stride_z + batch_id * src_stride_w;
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    u0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    u1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
-
-    // Store blocks
-    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w;
-    VSTORE(VEC_SIZE)
-    (u0, 0, (__global DATA_TYPE *)(output_ptr + 0 * dst_stride_y));
-    VSTORE(VEC_SIZE)
-    (u1, 0, (__global DATA_TYPE *)(output_ptr + 1 * dst_stride_y));
-}
-
 #if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_DIM_X)
 
 /** Performs channel shuffle when the data layout is NHWC. See https://arxiv.org/pdf/1707.01083.pdf for details.
@@ -219,4 +157,4 @@ __kernel void channel_shuffle_nhwc(TENSOR4D_DECLARATION(src),
     STORE_VECTOR_SELECT(out, DATA_TYPE, output_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
 }
 #endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_DIM_X)
-#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/depth_to_space.cl b/src/core/CL/cl_kernels/nhwc/depth_to_space.cl
new file mode 100644
index 0000000000..5464a4bef8
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/depth_to_space.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+/** Depth to space transformation. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor depth size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
+ * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: All.
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[in]  batch_id                             The input tensor batch id
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void depth_to_space_nhwc(
+    TENSOR3D_DECLARATION(input),
+    const int batch_id,
+    TENSOR4D_DECLARATION(output))
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+    const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+    const int x = get_global_id(1);
+    const int y = get_global_id(2);
+    const int z = get_global_id(0) % r;
+
+    const int out_x = x * BLOCK_SHAPE + (get_global_id(0) / r) % BLOCK_SHAPE;
+    const int out_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE;
+
+    *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, batch_id)) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/dequantization_layer.cl b/src/core/CL/cl_kernels/nhwc/dequantization_layer.cl
new file mode 100644
index 0000000000..238d3a7921
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/dequantization_layer.cl
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
+/** This performs per channel dequantization of 8-bit signed integers to floating point. (NHWC)
+ *
+ * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
+ * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: F16/F32
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  scale                                Pointer to buffer with the per channel quantized scales
+ */
+__kernel void dequantization_layer_per_channel_nhwc(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output),
+    __global float *scale)
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+    output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+    scale -= max(xi - (int)LAST_ACCESSED_X, 0);
+
+    // Load data
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
+
+    // Create scale vectors
+    const VEC_DATA_TYPE(float, VEC_SIZE)
+    vscale = VLOAD(VEC_SIZE)(0, &scale[xi]);
+
+    // Dequantize
+    VEC_DATA_TYPE(float, VEC_SIZE)
+    res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE));
+
+    // Store result
+    VSTORE(VEC_SIZE)
+    (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
+#else  // !defined(LAST_ACCESSED_X)
+    *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(0)]);
+#endif // defined(LAST_ACCESSED_X)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/direct_convolution.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
index 75a7a0f004..75a7a0f004 100644
--- a/src/core/CL/cl_kernels/direct_convolution.cl
+++ b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
diff --git a/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
index 1f940001f3..d2e7e45ada 100644
--- a/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl
+++ b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
@@ -125,7 +125,7 @@ __kernel void dwc_native_fp_nhwc(
     const int bout = GET_SPATIAL_IDX(2, 1, 0) / _IDST_HEIGHT; // BATCH SIZE IDX
 #else                                                         // defined(BATCHED_EXECUTION)
     const int yo   = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
-    const int bout = 0; // BATCH SIZE IDX
+    const int bout = 0;                        // BATCH SIZE IDX
 #endif                                                        // defined(BATCHED_EXECUTION)
 
     int xi = xo * STRIDE_X;
@@ -148,37 +148,37 @@ __kernel void dwc_native_fp_nhwc(
 
 #if _IWEI_HEIGHT <= 5
         LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
-#else // _IWEI_HEIGHT <= 5
+#else  // _IWEI_HEIGHT <= 5
         for(int yk = 0; yk < _IWEI_HEIGHT; yk++)
 #endif // _IWEI_HEIGHT <= 5
-        {
-            TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
+                       {
+                           TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
 
-            LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
-            {
-                a[i].v = 0;
-            })
+                           LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
+        {
+            a[i].v = 0;
+        })
 
-            // Load tile from the src tensor (TILE A)
-            T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, DILATION_X, 1, _IBOUNDARY_CHECK, a);
+        // Load tile from the src tensor (TILE A)
+        T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, DILATION_X, 1, _IBOUNDARY_CHECK, a);
 
-            TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b);
+        TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b);
 
-            // Load tile from the weights tensor (TILE B)
-            T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, (cout * DEPTH_MULTIPLIER) + d, yk * _IM0_B, 1, wei_stride_y, b);
+        // Load tile from the weights tensor (TILE B)
+        T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, (cout * DEPTH_MULTIPLIER) + d, yk * _IM0_B, 1, wei_stride_y, b);
 
-            // Optimized path for STRIDE_X == 1
-            // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension
-            LOOP_UNROLLING(int, m0, 0, 1, M0,
+        // Optimized path for STRIDE_X == 1
+        // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
             {
-                LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
-                {
-                    c[m0].v += a[xk + m0].v * b[xk].v;
-                })
+                c[m0].v += a[xk + m0].v *b[xk].v;
             })
-        }
+        })
+                       }
 #if _IWEI_HEIGHT <= 5
-        )
+                      )
 #endif // _IWEI_HEIGHT <= 5
 
 #if defined(HAS_BIAS)
diff --git a/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
index aa6ba4de39..1bc58b6e26 100644
--- a/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl
+++ b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
@@ -179,61 +179,61 @@ __kernel void dwc_native_quantized_nhwc(
 
 #if _IWEI_HEIGHT <= 5
         LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
-#else // _IWEI_HEIGHT <= 5
+#else  // _IWEI_HEIGHT <= 5
         for(int yk = 0; yk < _IWEI_HEIGHT; yk++)
 #endif // _IWEI_HEIGHT <= 5
-        {
-            TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
+                       {
+                           TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
 
-            LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
-            {
-                a[i].v = ZERO_VALUE;
-            })
+                           LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
+        {
+            a[i].v = ZERO_VALUE;
+        })
 
-            // Load tile from the src tensor (TILE A)
-            T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, DILATION_X, 1, _IBOUNDARY_CHECK, a);
+        // Load tile from the src tensor (TILE A)
+        T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, DILATION_X, 1, _IBOUNDARY_CHECK, a);
 
-            TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b);
+        TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b);
 
-            // Load tile from the weights tensor (TILE B)
-            T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, cout * DEPTH_MULTIPLIER + d, yk * _IM0_B, 1, wei_stride_y, b);
+        // Load tile from the weights tensor (TILE B)
+        T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, cout * DEPTH_MULTIPLIER + d, yk * _IM0_B, 1, wei_stride_y, b);
 
-            // Optimized path for STRIDE_X == 1
-            // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension
-            LOOP_UNROLLING(int, m0, 0, 1, M0,
+        // Optimized path for STRIDE_X == 1
+        // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension
+        LOOP_UNROLLING(int, m0, 0, 1, M0,
+        {
+            LOOP_UNROLLING(int, n0, 0, 1, N0,
             {
-                LOOP_UNROLLING(int, n0, 0, 1, N0,
-                {
 #if _IWEI_WIDTH <= 16
 #define DOT_DATA_TYPE SRC_DATA_TYPE
 #define WEI_OFFSET_CORRECTION (CALCULATE_WEIGHTS_OFFSET_CORRECTION(SRC_DATA_TYPE, WEI_DATA_TYPE))
 
-                    // Optimized path for the dot instruction
-                    TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, x0);
-                    TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, y0);
-                    ACC_DATA_TYPE offset_a = 0;
-                    ACC_DATA_TYPE offset_b = 0;
+                // Optimized path for the dot instruction
+                TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, x0);
+                TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, y0);
+                ACC_DATA_TYPE offset_a = 0;
+                ACC_DATA_TYPE offset_b = 0;
 
-                    LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
-                    {
-                        x0[0].s[xk] = a[xk + m0].s[n0];
-                        y0[0].s[xk] = b[xk].s[n0] + (int)WEI_OFFSET_CORRECTION;
-                    })
-                    DOT_PRODUCT_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, y0[0].v, c[m0].s[n0]);
-                    REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, offset_a);
-                    REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, y0[0].v, offset_b);
-                    c[m0].s[n0] += offset_a * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION) + offset_b * (ACC_DATA_TYPE)SRC_OFFSET;
+                LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
+                {
+                    x0[0].s[xk] = a[xk + m0].s[n0];
+                    y0[0].s[xk] = b[xk].s[n0] + (int)WEI_OFFSET_CORRECTION;
+                })
+                DOT_PRODUCT_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, y0[0].v, c[m0].s[n0]);
+                REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, offset_a);
+                REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, y0[0].v, offset_b);
+                c[m0].s[n0] += offset_a * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION) + offset_b * (ACC_DATA_TYPE)SRC_OFFSET;
 #else  // _IWEI_WIDTH <= 16
-                    LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
-                    {
-                        c[m0].s[n0] += ((ACC_DATA_TYPE)a[xk + m0].s[n0] + (ACC_DATA_TYPE)(SRC_OFFSET)) * ((ACC_DATA_TYPE)b[xk].s[n0] + (ACC_DATA_TYPE)(WEI_OFFSET));
-                    })
-#endif // _IWEI_WIDTH <= 16
+                LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
+                {
+                    c[m0].s[n0] += ((ACC_DATA_TYPE)a[xk + m0].s[n0] + (ACC_DATA_TYPE)(SRC_OFFSET)) * ((ACC_DATA_TYPE)b[xk].s[n0] + (ACC_DATA_TYPE)(WEI_OFFSET));
                 })
+#endif // _IWEI_WIDTH <= 16
             })
-        }
+        })
+                       }
 #if _IWEI_HEIGHT <= 5
-        )
+                      )
 #endif // _IWEI_HEIGHT <= 5
 
 #if _IWEI_WIDTH <= 16
diff --git a/src/core/CL/cl_kernels/nhwc/im2col.cl b/src/core/CL/cl_kernels/nhwc/im2col.cl
new file mode 100644
index 0000000000..ac00c11283
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/im2col.cl
@@ -0,0 +1,532 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#if defined(DATA_TYPE) && defined(ELEMENT_SIZE)
+
+#if ELEMENT_SIZE == 1
+#define COND_DATA_TYPE char
+#elif ELEMENT_SIZE == 2
+#define COND_DATA_TYPE short
+#elif ELEMENT_SIZE == 4
+#define COND_DATA_TYPE int
+#else // ELEMENT_SIZE
+#error "Element size not support"
+#endif // ELEMENT_SIZE
+
+#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE)
+
+#define VECTOR_N VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+#define COND_N VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE)
+
+/** Store a 1x9 row or a 3x3 block in a boundary-aware manner to avoid paddings in the channel dimension
+ *  @name IM2COL1X9_NHWC_STORE
+ *
+ *  @note To use this macro for a 3x3 block, @p ROW has to be 0
+ *
+ * @param[in] VECTOR_SIZE          The non-boundary vector width of @p DATA. Supported: 1(scalar), 2, 3, 4, 8, 16
+ * @param[in] BOUNDARY_VECTOR_SIZE The boundary vector width of @p DATA. Supported: 1-16, but has to be <= @p size
+ * @param[in] DATA_TYPE            Data type of @p DATA
+ * @param[in] SRC_DEPTH            Input channel size / depth
+ * @param[in] DATA                 Value variable base name
+ * @param[in] ROW                  The row number to store. Supported: 0-8
+ * @param[in] OUTPUT_PTR           Output pointer
+ * @{
+ */
+#if defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
+#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR)         \
+    const bool at_channel_boundary = get_global_id(0) == 0;                                                          \
+    if(at_channel_boundary)                                                                                          \
+    {                                                                                                                \
+        IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+    }                                                                                                                \
+    else                                                                                                             \
+    {                                                                                                                \
+        IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR)                    \
+    }
+#else // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
+#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+    IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR)
+#endif // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
+
+#define IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+    VSTORE(VECTOR_SIZE)                                                                           \
+    (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH);                 \
+    VSTORE(VECTOR_SIZE)                                                                           \
+    (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH);                 \
+    VSTORE(VECTOR_SIZE)                                                                           \
+    (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH);                 \
+    VSTORE(VECTOR_SIZE)                                                                           \
+    (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH);                 \
+    VSTORE(VECTOR_SIZE)                                                                           \
+    (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH);                 \
+    VSTORE(VECTOR_SIZE)                                                                           \
+    (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH);                 \
+    VSTORE(VECTOR_SIZE)                                                                           \
+    (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH);                 \
+    VSTORE(VECTOR_SIZE)                                                                           \
+    (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH);                 \
+    VSTORE(VECTOR_SIZE)                                                                           \
+    (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH);
+
+#define IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
+    (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH);                                    \
+    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
+    (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH);                                    \
+    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
+    (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH);                                    \
+    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
+    (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH);                                    \
+    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
+    (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH);                                    \
+    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
+    (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH);                                    \
+    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
+    (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH);                                    \
+    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
+    (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH);                                    \
+    VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)                                                                \
+    (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH);
+/** @}*/
+
+/** This kernel performs im2col when the kernel size is 3x3 and the data layout is NHWC
+ *
+ * @note This kernel computes VECTOR_SIZE elements
+ * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
+ * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col3x3_nhwc(
+    TENSOR3D_DECLARATION(src),
+    IMAGE_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
+    const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
+    const int ch           = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
+    const int yo           = get_global_id(1);
+    const int batch        = get_global_id(2); // batch size
+
+    // Calculate input indices
+    const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
+    const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
+
+    // Get input and output address
+    __global uchar *input_ptr  = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
+
+    int  yi_coord = 0;
+    int3 offset   = 0;
+
+    // Clamp xi
+    int3 xi_offset = ((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT);
+#if PAD_LEFT != 0 || PAD_RIGHT != 0
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+    xi_offset = CLAMP(xi_offset, (int3)0, (int3)(SRC_WIDTH - 1));
+#endif // PAD_LEFT != 0 || PAD_RIGHT != 0
+    // Multiply by src_stride_y as the width (X) dimension here is the second (y) dimension in src NHWC tensor
+    xi_offset *= (int3)src_stride_y;
+
+    // Out-of-bound condition for X
+    int3 x_cond = (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) < (int3)0) || (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) >= (int3)SRC_WIDTH);
+
+    // yi == 0
+    // Clamp yi
+    // yi_coord is casted to unsigned int in order to use just a min() operation
+    // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
+    // This is a trick so that the values loaded in the padding areas are always from the last row (SRC_HEIGHT - 1),
+    // because of the negative yi_coord wrap-around, but it gets overwritten by PAD_VALUE immediately as the wrap-around
+    // also causes y_cond (y padding condition) to be satisfied
+    yi_coord = yi - (int)PAD_TOP;
+
+    // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+    yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+
+    // Compute offset
+    offset = xi_offset + (yi_coord * (int)src_stride_z);
+
+    // Load input values
+    VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+    VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+    VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+    // Replace invalid values with PAD_VALUE
+    int y_cond = (int)((uint)(yi - (int)PAD_TOP) >= (uint)(SRC_HEIGHT));
+    values0    = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
+    values1    = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
+    values2    = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+    // yi == 1
+    // Clamp yi_coord (it can be negative if PAD_TOP > 1)
+    yi_coord = yi - (int)PAD_TOP + 1 * DILATION_Y;
+
+    // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+    yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+
+    // Compute offset
+    offset = xi_offset + (yi_coord * (int)src_stride_z);
+
+    // Load input values
+    VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+    VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+    VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+    // Replace invalid values with zeros
+    y_cond  = (int)((uint)(yi - (int)PAD_TOP + 1 * DILATION_Y) >= (uint)(SRC_HEIGHT));
+    values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
+    values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
+    values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+    // yi == 2
+    // Clamp yi_coord
+    yi_coord = yi - (int)PAD_TOP + 2 * DILATION_Y;
+
+    // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+    yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+
+    // Compute offset
+    offset = xi_offset + (yi_coord * (int)src_stride_z);
+
+    // Load input values
+    VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+    VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+    VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+    // Replace invalid values with PAD_VALUE
+    y_cond  = (int)((uint)(yi - (int)PAD_TOP + 2 * DILATION_Y) >= (uint)(SRC_HEIGHT));
+    values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
+    values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
+    values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+    // Store in a boundary-aware way to avoid padding
+    IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, 0, output_ptr)
+
+#ifdef HAS_BIAS
+    // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
+    // added at the end of the channel, while the boundary vec is at the beginning of the channel.
+    // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
+    // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
+    // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
+    if((ch + VECTOR_SIZE) >= SRC_DEPTH)
+    {
+        *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 9) = 1.0f;
+    }
+#endif // HAS_BIAS
+}
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+#define IM2COL1x9(i)                                                                                         \
+    ({                                                                                                       \
+        yi_coord = yi - (int)PAD_TOP + i * DILATION_Y;                                                       \
+        yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));                                              \
+        \
+        offset0 = xi_offset0 + (yi_coord * (int)src_stride_z);                                               \
+        offset1 = xi_offset1 + (yi_coord * (int)src_stride_z);                                               \
+        \
+        VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0));            \
+        VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1));            \
+        VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2));            \
+        VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3));            \
+        VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4));            \
+        VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5));            \
+        VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6));            \
+        VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7));            \
+        VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1));               \
+        \
+        int y_cond = (int)((uint)(yi - (int)PAD_TOP + i * DILATION_Y) >= (uint)(SRC_HEIGHT));                \
+        values0    = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s0))); \
+        values1    = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s1))); \
+        values2    = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s2))); \
+        values3    = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s3))); \
+        values4    = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s4))); \
+        values5    = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s5))); \
+        values6    = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s6))); \
+        values7    = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s7))); \
+        values8    = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond1)));    \
+        \
+        IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \
+    })
+#else // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+#define IM2COL1x9(i)                                                                                         \
+    ({                                                                                                       \
+        yi_coord = yi - (int)PAD_TOP + i * DILATION_Y;                                                       \
+        yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));                                              \
+        \
+        offset0 = xi_offset0 + (yi_coord * (int)src_stride_z);                                               \
+        offset1 = xi_offset1 + (yi_coord * (int)src_stride_z);                                               \
+        \
+        VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0));            \
+        VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1));            \
+        VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2));            \
+        VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3));            \
+        VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4));            \
+        VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5));            \
+        VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6));            \
+        VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7));            \
+        VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1));               \
+        \
+        IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \
+    })
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+/** This kernel performs im2col when the kernel size is 9x9 and the data layout is NHWC
+ *
+ * @note This kernel computes VECTOR_SIZE elements
+ * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
+ * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col9x9_nhwc(
+    TENSOR3D_DECLARATION(src),
+    IMAGE_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
+    const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
+    const int ch           = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
+    const int yo           = get_global_id(1);
+    const int batch        = get_global_id(2); // batch size
+
+    // Calculate input indices
+    const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
+    const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
+
+    // Get input and output address
+    __global uchar *input_ptr  = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
+
+    int  yi_coord = 0;
+    int8 offset0  = 0;
+    int  offset1  = 0;
+
+    // Clamp xi
+    int8 xi_offset0 = ((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT);
+    int  xi_offset1 = ((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT);
+
+#if PAD_LEFT != 0 || PAD_RIGHT != 0
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+    xi_offset0 = CLAMP(xi_offset0, (int8)0, (int8)(SRC_WIDTH - 1));
+    xi_offset1 = CLAMP(xi_offset1, (int)0, (int)(SRC_WIDTH - 1));
+#endif // PAD_LEFT != 0 || PAD_RIGHT != 0
+    xi_offset0 *= (int8)src_stride_y;
+    xi_offset1 *= (int)src_stride_y;
+
+    // Out-of-bound condition for X
+    int8 x_cond0 = (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) < (int8)0) || (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) >= (int8)SRC_WIDTH);
+    int  x_cond1 = (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) < (int)0) || (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);
+
+    IM2COL1x9(0);
+    IM2COL1x9(1);
+    IM2COL1x9(2);
+    IM2COL1x9(3);
+    IM2COL1x9(4);
+    IM2COL1x9(5);
+    IM2COL1x9(6);
+    IM2COL1x9(7);
+    IM2COL1x9(8);
+
+#ifdef HAS_BIAS
+    // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
+    // added at the end of the channel, while the boundary vec is at the beginning of the channel.
+    // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
+    // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
+    // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
+    if((ch + VECTOR_SIZE) >= SRC_DEPTH)
+    {
+        *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 81) = 1.0f;
+    }
+#endif // HAS_BIAS
+}
+
+/** This opencl kernel performs a generic im2col implementation when the data layout is NHWC
+ *
+ * @note This kernel computes VECTOR_SIZE elements
+ * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
+ * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64
+ * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
+ * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes).
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col_generic_nhwc(
+    TENSOR3D_DECLARATION(src),
+    IMAGE_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
+    const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
+    const int ch           = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
+    const int yo           = get_global_id(1);
+    const int batch        = get_global_id(2); // batch size
+
+    // Calculate input indices
+    const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
+    const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
+
+    // Get input and output address
+    __global uchar *input_ptr  = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
+
+    int i = 0;
+    for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
+    {
+        // Clamp yi_coord
+        int yi_coord = yi + yk * DILATION_Y - (int)PAD_TOP;
+        yi_coord     = CLAMP(yi_coord, (int)0, (int)(SRC_HEIGHT - 1));
+
+        // Out-of-bound condition for Y
+        int y_border_condition = ((yi + yk * DILATION_Y - (int)PAD_TOP) < (int)0) || ((yi + yk * DILATION_Y - (int)PAD_TOP) >= (int)SRC_HEIGHT);
+
+        for(int xk = 0; xk < KERNEL_WIDTH; ++xk)
+        {
+            // Clamp xi_coord
+            int xi_coord = (xi + xk * DILATION_X - (int)PAD_LEFT);
+            xi_coord     = CLAMP(xi_coord, (int)0, (int)(SRC_WIDTH - 1));
+
+            // Out-of-bound condition for X
+            int x_border_condition = ((xi + xk * DILATION_X - (int)PAD_LEFT) < (int)0) || ((xi + xk * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);
+
+            int offset = xi_coord * (int)src_stride_y + (yi_coord * (int)src_stride_z);
+
+            VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset));
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+            // Replace with PAD_VALUE if the value is out-of-bound
+            values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)x_border_condition || (COND_N)(y_border_condition)));
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+
+            // Store in a boundary-aware way to avoid padding
+#if BOUNDARY_VECTOR_SIZE != VECTOR_SIZE
+            const bool at_channel_boundary = get_global_id(0) == 0;
+            if(at_channel_boundary)
+            {
+                VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)
+                (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);
+            }
+            else // at_channel_boundary
+#endif           // BOUNDARY_VECTOR_SIZE != VECTOR_SIZE
+            {
+                VSTORE(VECTOR_SIZE)
+                (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);
+            }
+            i++;
+        }
+    }
+
+#ifdef HAS_BIAS
+    // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
+    // added at the end of the channel, while the boundary vec is at the beginning of the channel.
+    // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
+    // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
+    // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
+    if((ch + VECTOR_SIZE) >= SRC_DEPTH)
+    {
+        *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT) = 1.0f;
+    }
+#endif // HAS_BIAS
+}
+#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE)
+#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/normalization_layer.cl b/src/core/CL/cl_kernels/nhwc/normalization_layer.cl
index 4569208824..7e35e161c8 100644
--- a/src/core/CL/cl_kernels/normalization_layer.cl
+++ b/src/core/CL/cl_kernels/nhwc/normalization_layer.cl
@@ -30,69 +30,6 @@
 #define POW_OP(x, y) pow((x), (y))
 #define SQCVT_SAT(a) (a)
 
-#if defined(NUM_SLICES)
-/** Apply cross-map normalization.
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
- * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
- * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192
- * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA
- *
- * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void normalization_layer_cross_map_nchw(TENSOR3D_DECLARATION(input),
-                                                 TENSOR3D_DECLARATION(output))
-{
-    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    acc = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0;
-    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    coeff_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(COEFF);
-    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    beta_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(BETA);
-    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA);
-
-    const int current_slice = get_global_id(2);
-    const int left_slice    = max(-(int)RADIUS, -current_slice);
-    const int right_slice   = min((int)RADIUS, (int)NUM_SLICES - 1 - current_slice);
-
-    for(int i = left_slice; i <= right_slice; i++)
-    {
-        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-        values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, 0, i));
-        acc    = ADD_OP(acc, MUL_OP(values, values));
-    }
-
-    acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
-    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    normalized = POW_OP(acc, beta_v);
-    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    normalized_pixel = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), normalized);
-
-    VSTORE(VEC_SIZE)
-    (normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
-}
-#endif /* defined(NUM_SLICES) */
-
 #if defined(WIDTH_SIZE)
 /** Apply cross-map normalization.
  *
@@ -156,85 +93,6 @@ __kernel void normalization_layer_cross_map_nhwc(TENSOR3D_DECLARATION(input),
 
     STORE_VECTOR_SELECT(normalized_pixel, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
 }
-
-/** Apply in-map normalization when tensors are in the NCHW data layout format.
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
- * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
- * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA
- * @note The leftover size in the X dimension shoud be given as preprocessor argument using -DVEC_SIZE_LEFTOVER is; x_dimension % VEC_SIZE. e.g. -DVEC_SIZE_LEFTOVER=1
- *
- * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the first destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the first source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void normalization_layer_in_map_nchw(TENSOR3D_DECLARATION(input),
-                                              TENSOR3D_DECLARATION(output))
-{
-    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    acc = 0;
-    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    coeff_v = SQCVT_SAT(COEFF);
-    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    beta_v = SQCVT_SAT(BETA);
-    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    kappa_v = SQCVT_SAT(KAPPA);
-
-    const int current_col = get_global_id(0) << 2;
-    const int left_pos    = max(-(int)RADIUS, -3 - current_col);
-    const int right_pos   = min((int)RADIUS, (int)WIDTH_SIZE - 1 - current_col);
-
-#if defined(IN_MAP_2D)
-    const int current_row = get_global_id(1);
-    const int first_row   = max(-(int)RADIUS, -current_row);
-    const int last_row    = min((int)RADIUS, (int)get_global_size(1) - 1 - current_row);
-#endif /* defined(IN_MAP_2D) */
-
-#if defined(IN_MAP_2D)
-    for(int j = first_row; j <= last_row; ++j)
-    {
-#endif /* defined(IN_MAP_2D) */
-        for(int i = left_pos; i <= right_pos; ++i)
-        {
-#if defined(IN_MAP_2D)
-            VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-            values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, j, 0));
-#else  /* defined(IN_MAP_2D) */
-            VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-            values  = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, 0, 0));
-#endif /* defined(IN_MAP_2D) */
-            acc = ADD_OP(acc, MUL_OP(values, values));
-        }
-#if defined(IN_MAP_2D)
-    }
-#endif /* defined(IN_MAP_2D) */
-
-    acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
-    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    normalized = POW_OP(acc, beta_v);
-    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    normalized_pixel = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), normalized);
-
-    VSTORE(VEC_SIZE)
-    (normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
-}
 #endif // defined(WIDTH_SIZE)
 
 #if defined(NUM_SLICES) && defined(DIM1_SIZE)
@@ -267,9 +125,9 @@ __kernel void normalization_layer_in_map_nhwc(TENSOR3D_DECLARATION(input),
                                               TENSOR3D_DECLARATION(output))
 {
     // Offset computation
-    const uint x_offs = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
-    const int current_cols = get_global_id(1);
-    const int current_rows = get_global_id(2);
+    const uint x_offs       = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
+    const int  current_cols = get_global_id(1);
+    const int  current_rows = get_global_id(2);
 
     // Address computation
     __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE);
@@ -284,8 +142,8 @@ __kernel void normalization_layer_in_map_nhwc(TENSOR3D_DECLARATION(input),
     const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
     kappa_v = SQCVT_SAT(KAPPA);
 
-    const int first_col    = max(0, current_cols - (int)RADIUS);
-    const int last_col     = min((int)DIM1_SIZE - 1, current_cols + (int)RADIUS);
+    const int first_col = max(0, current_cols - (int)RADIUS);
+    const int last_col  = min((int)DIM1_SIZE - 1, current_cols + (int)RADIUS);
 
 #if defined(IN_MAP_2D)
     const int first_row = max(0, current_rows - (int)RADIUS);
@@ -312,7 +170,7 @@ __kernel void normalization_layer_in_map_nhwc(TENSOR3D_DECLARATION(input),
     const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
     normalized = POW_OP(acc, beta_v);
     const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    normalized_pixel0 = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + current_cols * output_stride_y + current_rows * output_stride_z)), normalized);
+    normalized_pixel0 = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + current_cols * output_stride_y + current_rows *output_stride_z)), normalized);
 
     STORE_VECTOR_SELECT(normalized_pixel, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
 }
diff --git a/src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl
index 0a098356b4..86c33499e2 100644
--- a/src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl
+++ b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl
@@ -27,59 +27,6 @@
 
 #define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
 
-/** Apply normalize_planar_yuv layer on tensors with NCHW data layout.
- *
- * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
- * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8
- *
- * @param[in]  src_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in]  src_stride_x                       Stride of the first source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the first source tensor
- * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
- * @param[in]  mean_ptr                           Pointer to the mean source tensor. Supported data types: same as @p src_ptr
- * @param[in]  mean_stride_x                      Stride of the mean source tensor in X dimension (in bytes)
- * @param[in]  mean_step_x                        mean_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
- * @param[in]  std_ptr                            Pointer to the std tensor. Supported data types: same as @p src_ptr
- * @param[in]  std_stride_x                       Stride of the std tensor in X dimension (in bytes)
- * @param[in]  std_step_x                         std_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  std_offset_first_element_in_bytes  The offset of the first element in the var source tensor
- */
-__kernel void normalize_planar_yuv_layer_nchw(TENSOR3D_DECLARATION(src),
-                                              TENSOR3D_DECLARATION(dst),
-                                              VECTOR_DECLARATION(mean),
-                                              VECTOR_DECLARATION(std))
-{
-    Tensor3D src  = CONVERT_TO_TENSOR3D_STRUCT(src);
-    Tensor3D dst  = CONVERT_TO_TENSOR3D_STRUCT(dst);
-    Vector   mean = CONVERT_TO_VECTOR_STRUCT(mean);
-    Vector   std  = CONVERT_TO_VECTOR_STRUCT(std);
-
-    const uint current_slice = get_global_id(2) % NUM_CHANNELS;
-
-    const DATA_TYPE curr_mean = *((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE)));
-    const DATA_TYPE curr_std  = *((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE)));
-
-    TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
-    TYPE res  = (data - curr_mean) / curr_std;
-
-    VSTORE(VEC_SIZE)
-    (res, 0, (__global DATA_TYPE *)dst.ptr);
-}
-
 /** Apply normalize_planar_yuv layer on tensors with NHWC data layout.
  *
  * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
@@ -131,4 +78,4 @@ __kernel void normalize_planar_yuv_layer_nhwc(TENSOR3D_DECLARATION(src),
 
     STORE_VECTOR_SELECT(res, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
 }
-#endif // defined(DATA_TYPE) && defined(VEC_SIZE)
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl
index d660fffb58..7bc3c15a63 100644
--- a/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl
@@ -29,76 +29,6 @@
 #define OFFSET_FLT ((float)OFFSET)
 #define SCALE_FLT ((float)SCALE)
 
-#if defined(NUM_CHANNELS)
-
-/** Apply normalize_planar_yuv layer on tensors with NCHW data layout.
- *
- * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
- * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8
- * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8
- * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8
- *
- * @param[in]  src_ptr                            Pointer to the first source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in]  src_stride_x                       Stride of the first source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the first source tensor
- * @param[out] dst_ptr                            Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
- * @param[in]  mean_ptr                           Pointer to the mean source tensor. Supported data types: same as @p src_ptr
- * @param[in]  mean_stride_x                      Stride of the mean source tensor in X dimension (in bytes)
- * @param[in]  mean_step_x                        mean_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
- * @param[in]  std_ptr                            Pointer to the std tensor. Supported data types: same as @p src_ptr
- * @param[in]  std_stride_x                       Stride of the std tensor in X dimension (in bytes)
- * @param[in]  std_step_x                         std_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  std_offset_first_element_in_bytes  The offset of the first element in the var source tensor
- */
-__kernel void normalize_planar_yuv_layer_q8_nchw(TENSOR3D_DECLARATION(src),
-                                                 TENSOR3D_DECLARATION(dst),
-                                                 VECTOR_DECLARATION(mean),
-                                                 VECTOR_DECLARATION(std))
-{
-    Tensor3D src  = CONVERT_TO_TENSOR3D_STRUCT(src);
-    Tensor3D dst  = CONVERT_TO_TENSOR3D_STRUCT(dst);
-    Vector   mean = CONVERT_TO_VECTOR_STRUCT(mean);
-    Vector   std  = CONVERT_TO_VECTOR_STRUCT(std);
-
-    const uint current_slice = get_global_id(2) % NUM_CHANNELS;
-
-    VEC_DATA_TYPE(float, VEC_SIZE)
-    curr_mean_flt = (VEC_DATA_TYPE(float, VEC_SIZE))(*((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE))));
-    curr_mean_flt = round(curr_mean_flt - OFFSET_FLT) * SCALE_FLT;
-
-    VEC_DATA_TYPE(float, VEC_SIZE)
-    curr_std_flt = (VEC_DATA_TYPE(float, VEC_SIZE))(*((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE))));
-    curr_std_flt = round(curr_std_flt - OFFSET_FLT) * SCALE_FLT;
-
-    VEC_DATA_TYPE(float, VEC_SIZE)
-    data_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr), VEC_DATA_TYPE(float, VEC_SIZE));
-    data_flt = round(data_flt - OFFSET_FLT) * SCALE_FLT;
-
-    // Perform normalization
-    VEC_DATA_TYPE(float, VEC_SIZE)
-    res_flt = (data_flt - curr_mean_flt) / curr_std_flt;
-
-    const TYPE res_u8 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE);
-    VSTORE(VEC_SIZE)
-    (res_u8, 0, (__global DATA_TYPE *)dst.ptr);
-}
-
-#endif // defined(NUM_CHANNELS)
-
 /** Apply normalize_planar_yuv layer on tensors with NHWC data layout.
  *
  * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
@@ -163,4 +93,4 @@ __kernel void normalize_planar_yuv_layer_q8_nhwc(TENSOR3D_DECLARATION(src),
     const TYPE res0 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE);
     STORE_VECTOR_SELECT(res, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
 }
-#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE)
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/pooling_layer.cl b/src/core/CL/cl_kernels/nhwc/pooling_layer.cl
new file mode 100644
index 0000000000..5b59ff5088
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/pooling_layer.cl
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "repeat.h"
+#include "tile_helpers.h"
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define POOL_OP(x, y) (fmax((x), (y)))
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) ((x) * (x))
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+#define SQRT_OP(x) sqrt((x))
+
+#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
+
+#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
+/** Performs pooling layer of size equal to MxN. This OpenCL kernel can perform the following pooling types:
+ * -# max, -DPOOL_MAX must be passed at compile time
+ * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be expluded, -DEXCLUDE_PADDING should be passed at compile time
+ * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
+ *
+ * @note Datatype must be passed at compile type using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
+ * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
+ * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
+ * @note Pool size must be passed at compile time using -DPOOL_SIZE_X and -DPOOL_SIZE_Y. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4
+ * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
+ * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
+ * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void pooling_layer_MxN_nhwc(
+    TENSOR4D_DECLARATION(input),
+    TENSOR4D_DECLARATION(output))
+{
+    // Note: If C is not multiple of VEC_SIZE, we shift back of VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0
+    // Note: If C is less than VEC_SIZE, VEC_SIZE should be SHRINKED to the closest smaller VEC_SIZE. This operation is performed on the host side
+    int idx_out_c = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
+    int idx_out_w = GET_SPATIAL_IDX(1, 1, 0);
+#if DST_BATCH_SIZE != 1
+    // If batch size != 1, the batch size dimension is collapsed over the height dimension
+    int idx_out_h = GET_SPATIAL_IDX(2, 1, 0) % DST_HEIGHT;
+    int idx_out_n = GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT;
+#else  //DST_BATCH_SIZE != 1
+    int idx_out_h   = GET_SPATIAL_IDX(2, 1, 0);
+    int idx_out_n   = 0;
+#endif // DST_BATCH_SIZE != 1
+
+    __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w;
+
+    __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n *
+                                           output_stride_w;
+
+    VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+    res0 = INITIAL_VALUE;
+
+    int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
+    int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
+
+    int pool_x_s = max((int)0, -idx_in_w);
+    int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
+    int pool_y_s = max((int)0, -idx_in_h);
+    int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
+
+#if defined(EXCLUDE_PADDING)
+    int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
+#else  // defined(EXCLUDE_PADDING)
+    int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
+#endif // defined(EXCLUDE_PADDING)
+
+#if POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
+    // Global pooling path
+    for(int y = 0; y < POOL_SIZE_Y; ++y)
+    {
+#pragma unroll 8
+        for(int x = 0; x < POOL_SIZE_X; ++x)
+        {
+#else // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
+    for(int y = pool_y_s; y < pool_y_e; ++y)
+    {
+#pragma unroll 8
+        for(int x = pool_x_s; x < pool_x_e; ++x)
+        {
+#endif // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
+            VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+            data0;
+#if defined(FP_MIXED_PRECISION)
+            // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
+            data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+#else  // defined(FP_MIXED_PRECISION)
+            data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z));
+#endif // defined(FP_MIXED_PRECISION)
+
+#if defined(POOL_L2)
+            // Raise to power of 2 for L2 Pooling
+            data0 *= data0;
+#endif // defined(POOL_L2)
+            res0 = POOL_OP(res0, data0);
+        }
+    }
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
+#endif // defined(POOL_AVG) || defined(POOL_L2)
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res0 = SQRT_OP(res0);
+#endif // defined(POOL_L2)
+
+    // Store result
+#if defined(FP_MIXED_PRECISION)
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+    STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#else  // defined(FP_MIXED_PRECISION)
+    STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#endif // defined(FP_MIXED_PRECISION)
+}
+#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
+
+#define SELECT_TYPE SELECT_VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+
+/** Performs pooling layer of size equal to 2. This OpenCL kernel can perform the following pooling types:
+ * -# max, -DPOOL_MAX must be passed at compile time
+ * -# max extracting the max index, -DPOOL_MAX and -DEXTRACT_MAX_INDEX must be passed at compile time
+ * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be expluded, -DEXCLUDE_PADDING should be passed at compile time
+ * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
+ *
+ * @note Datatype must be passed at compile type using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
+ * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
+ * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
+ * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
+ * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
+ * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ *
+ * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_stride_w                        Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                          input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_stride_w                       Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  output_step_w                         output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  indices_ptr                           (Optional) Pointer to the indices tensor. Supported data types: U32
+ * @param[in]  indices_stride_x                      (Optional) Stride of the indices tensor in X dimension (in bytes)
+ * @param[in]  indices_step_x                        (Optional) indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  indices_stride_y                      (Optional) Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in]  indices_step_y                        (Optional) indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  indices_stride_z                      (Optional) Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in]  indices_step_z                        (Optional) indices_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  indices_stride_w                      (Optional) Stride of the indices tensor in W dimension (in bytes)
+ * @param[in]  indices_step_w                        (Optional) indices_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  indices_offset_first_element_in_bytes (Optional) The offset of the first element in the indices tensor
+ */
+__kernel void pooling_layer_2x2_nhwc(
+    TENSOR4D_DECLARATION(input),
+    TENSOR4D_DECLARATION(output)
+#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
+    ,
+    TENSOR4D_DECLARATION(indices)
+#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
+)
+{
+    // Note: If C is not multiple of VEC_SIZE, we shift back of VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0
+    // Note: If C is less than VEC_SIZE, VEC_SIZE should be SHRINKED to the closest smaller VEC_SIZE. This operation is performed on the host side
+    int idx_out_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+    int idx_out_w = get_global_id(1);
+#if DST_BATCH_SIZE != 1
+    // If batch size != 1, the batch size dimension is collapsed over the height dimension
+    int idx_out_h = get_global_id(2) % DST_HEIGHT;
+    int idx_out_n = get_global_id(2) / DST_HEIGHT;
+#else  //SRC_BATCH_SIZE != 1
+    int idx_out_h = get_global_id(2);
+    int idx_out_n = 0;
+#endif // SRC_BATCH_SIZE != 1
+
+    int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
+    int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
+
+    __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w;
+
+    __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n *
+                                           output_stride_w;
+
+    int pool_x_s = max((int)0, -idx_in_w);
+    int pool_x_e = min((int)2, (int)SRC_WIDTH - idx_in_w);
+    int pool_y_s = max((int)0, -idx_in_h);
+    int pool_y_e = min((int)2, (int)SRC_HEIGHT - idx_in_h);
+
+    int filter_size = (pool_x_e - pool_x_s) * (pool_y_e - pool_y_s);
+
+    int x0 = pool_x_s + idx_in_w;
+    int y0 = pool_y_s + idx_in_h;
+    int x1 = pool_x_e - 1 + idx_in_w;
+    int y1 = pool_y_e - 1 + idx_in_h;
+
+    REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE), data, 0);
+
+#if defined(FP_MIXED_PRECISION)
+    // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
+    data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+    data1 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+    data2 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+    data3 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+#else  // defined(FP_MIXED_PRECISION)
+    data0         = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z));
+    data1         = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z));
+    data2         = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z));
+    data3         = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z));
+#endif // defined(FP_MIXED_PRECISION)
+
+#if !defined(POOL_MAX)
+    if(filter_size != 4)
+    {
+        SELECT_TYPE cond_w_s = (SELECT_TYPE)idx_in_w < (SELECT_TYPE)0;
+        SELECT_TYPE cond_w_e = (SELECT_TYPE)idx_in_w >= (SELECT_TYPE)(SRC_WIDTH - 1);
+        SELECT_TYPE cond_h_s = (SELECT_TYPE)idx_in_h < (SELECT_TYPE)0;
+        SELECT_TYPE cond_h_e = (SELECT_TYPE)idx_in_h >= (SELECT_TYPE)(SRC_HEIGHT - 1);
+
+        // Make invalid the values loaded if the x or y coordinate was clamped (out-of-bound)
+        data0 = select(data0, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_s));
+        data1 = select(data1, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_s));
+        data2 = select(data2, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_e));
+        data3 = select(data3, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_e));
+    }
+#endif // !defined(POOL_MAX)
+
+#if defined(POOL_L2)
+    // Raise to power of 2 for L2 Pooling
+    data0 *= data0;
+    data1 *= data1;
+    data2 *= data2;
+    data3 *= data3;
+#endif /* defined(POOL_L2) */
+
+    VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+    res0 = data0;
+    res0 = POOL_OP(res0, data1);
+    res0 = POOL_OP(res0, data2);
+    res0 = POOL_OP(res0, data3);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+#if defined(EXCLUDE_PADDING)
+    res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
+#else  // !defined(EXCLUDE_PADDING)
+    res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))4;
+#endif // defined(EXCLUDE_PADDING)
+#endif // defined(POOL_AVG) || defined(POOL_L2)
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res0 = SQRT_OP(res0);
+#endif // defined(POOL_L2)
+
+    // Store result
+#if defined(FP_MIXED_PRECISION)
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+    STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#else  // defined(FP_MIXED_PRECISION)
+    STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#endif // defined(FP_MIXED_PRECISION)
+
+#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
+
+    // This part is used to return the index of the maximum value
+    // Note: DST_CHANNELS and DST_BATCH_SIZE can be used for either the input and output tensor
+
+    // note: Batch dimension does not contribute in the offset contribution
+    VEC_DATA_TYPE(uint, VEC_SIZE)
+    base_index = (uint)idx_out_c;
+
+    base_index += VEC_OFFS(uint, VEC_SIZE);
+
+    VEC_DATA_TYPE(uint, VEC_SIZE)
+    index0 = base_index + (uint)x0 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
+    VEC_DATA_TYPE(uint, VEC_SIZE)
+    index1 = base_index + (uint)x1 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
+    VEC_DATA_TYPE(uint, VEC_SIZE)
+    index2 = base_index + (uint)x0 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
+    VEC_DATA_TYPE(uint, VEC_SIZE)
+    index3 = base_index + (uint)x1 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
+
+    index0 = select(index1, index0, CONVERT(isgreaterequal(data0, data1), VEC_DATA_TYPE(int, VEC_SIZE)));
+    index1 = select(index3, index2, CONVERT(isgreaterequal(data2, data3), VEC_DATA_TYPE(int, VEC_SIZE)));
+    index0 = select(index1, index0, CONVERT(isgreaterequal(max(data0, data1), max(data2, data3)), VEC_DATA_TYPE(int, VEC_SIZE)));
+
+    __global unsigned char *idx_base_ptr = indices_ptr + indices_offset_first_element_in_bytes + idx_out_c * sizeof(uint) + idx_out_w * indices_stride_y + idx_out_h * indices_stride_z + idx_out_n *
+                                           indices_stride_w;
+
+    // Store result
+    STORE_VECTOR_SELECT(index, uint, idx_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0));
+#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
+}
+#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl
index d8cef2b4e6..46268a4a88 100644
--- a/src/core/CL/cl_kernels/pooling_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,108 +51,6 @@
 #error "L2 pooling is not supported"
 #endif /* defined(POOL_L2) */
 
-int calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
-                        const int pad_x, const int pad_y, const int stride_x, const int stride_y)
-{
-    int       start_x = get_global_id(0) * stride_x - pad_x;
-    int       start_y = get_global_id(1) * stride_y - pad_y;
-    const int end_x   = min(start_x + pool_size_x, upper_bound_w);
-    const int end_y   = min(start_y + pool_size_y, upper_bound_h);
-#if defined(EXCLUDE_PADDING)
-    start_x = max(0, start_x);
-    start_y = max(0, start_y);
-#endif /* defined(EXCLUDE_PADDING) */
-    return ((end_y - start_y) * (end_x - start_x));
-}
-
-/** Performs a pooling function of pool size equal to N (NCHW)
- *
- * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
- * @note In case of average pooling the following information must be passed at compile time:
- *       -DPOOL_AVG must be provided otherwise max pooling will be performed.
- *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
- * @note Input data type must be passed at compile time using -DDAT_TYPE=type, e.g. -DDATA_TYPE=uchar
- * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void pooling_layer_MxN_quantized_nchw(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output))
-{
-    // Get pixels pointer
-    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-    int8 vdata = INITIAL_VALUE;
-    int  sdata = INITIAL_VALUE;
-
-    // Load data
-    for(int y = 0; y < POOL_SIZE_Y; y++)
-    {
-        int x = 0;
-        for(; x <= ((int)POOL_SIZE_X - 8); x += 8)
-        {
-            VEC_TYPE(8)
-            data       = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
-            int8 data0 = convert_int8(data);
-            vdata      = POOL_OP(vdata, data0);
-        }
-
-        // Leftover
-        for(; x < (int)POOL_SIZE_X; ++x)
-        {
-            DATA_TYPE data = *((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
-            int data0      = convert_int(data);
-            sdata          = POOL_OP(sdata, data0);
-        }
-    }
-
-    // Reduce result
-    int4 reduce4 = POOL_OP(vdata.s0123, vdata.s4567);
-    int2 reduce2 = POOL_OP(reduce4.s01, reduce4.s23);
-    int  res     = POOL_OP(reduce2.s0, reduce2.s1);
-    res          = POOL_OP(res, sdata);
-
-#if defined(POOL_AVG)
-    res = round(DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)));
-#endif /* defined(POOL_AVG) */
-
-    DATA_TYPE result_q8 = CONVERT(res, DATA_TYPE);
-
-#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
-
-    const float result_f32   = convert_float(result_q8);
-    const float input_offset = (float)OFFSET_IN1;
-    const float input_scale  = (float)SCALE_IN1;
-    const float scale_out    = (float)SCALE_OUT;
-    const float offset_out   = (float)OFFSET_OUT;
-    const float in_f32       = (result_f32 - input_offset) * input_scale;
-    const float out_f32      = in_f32 / scale_out + offset_out;
-    result_q8                = CONVERT_SAT(convert_int_rte(out_f32), DATA_TYPE);
-
-#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
-
-    *(__global DATA_TYPE *)output.ptr = result_q8;
-}
-
 #if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
 /** Performs pooling layer of size equal to MxN. This OpenCL kernel can perform the following pooling types:
  * -# max, -DPOOL_MAX must be passed at compile time
diff --git a/src/core/CL/cl_kernels/remap.cl b/src/core/CL/cl_kernels/nhwc/remap.cl
index cb67c2df1e..0b629fe6c9 100644
--- a/src/core/CL/cl_kernels/remap.cl
+++ b/src/core/CL/cl_kernels/nhwc/remap.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, 2021 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,113 +24,7 @@
 #include "helpers.h"
 #include "warp_helpers.h"
 
-#ifndef DEPTH_OUT
-/** Performs a remapping of an input image to an output given two remapping image using nearest neighbor as interpolation.
- *
- * This kernel performs remapping with this method of pixel coordinate translation:
- *     out(x,y) = in(mapx(x,y), mapy(x,y));
- *
- * @param[in]  in_ptr                             Pointer to the source image. Supported data types: U8.
- * @param[in]  in_stride_x                        Stride of the source image in X dimension (in bytes)
- * @param[in]  in_step_x                          in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  in_stride_y                        Stride of the source image in Y dimension (in bytes)
- * @param[in]  in_step_y                          in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in]  in_offset_first_element_in_bytes   Offset of the first element in the source image
- * @param[out] out_ptr                            Pointer to the destination image. Supported data types: U8.
- * @param[in]  out_stride_x                       Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                         out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  out_stride_y                       Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                         out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in]  out_offset_first_element_in_bytes  Offset of the first element in the destination image
- * @param[in]  mapx_ptr                           Pointer to the x remapping image. Supported data types: F32.
- * @param[in]  mapx_stride_x                      Stride of the remapping image in X dimension (in bytes)
- * @param[in]  mapx_step_x                        mapx_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  mapx_stride_y                      Stride of the remapping image in Y dimension (in bytes)
- * @param[in]  mapx_step_y                        mapy_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in]  mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in]  mapy_ptr                           Pointer to the x remapping image. Supported data types: F32.
- * @param[in]  mapy_stride_x                      Stride of the remapping image in X dimension (in bytes)
- * @param[in]  mapy_step_x                        mapy_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  mapy_stride_y                      Stride of the remapping image in Y dimension (in bytes)
- * @param[in]  mapy_step_y                        mapy_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in]  mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in]  width                              Width of the input image
- * @param[in]  height                             Height of the input image
- */
-__kernel void remap_nearest_neighbour_nchw(
-    IMAGE_DECLARATION(in),
-    IMAGE_DECLARATION(out),
-    IMAGE_DECLARATION(mapx),
-    IMAGE_DECLARATION(mapy),
-    const float width,
-    const float height)
-{
-    Image in   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
-    Image out  = CONVERT_TO_IMAGE_STRUCT(out);
-    Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx);
-    Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy);
-
-    float4 mapx_coords = vload4(0, (__global float *)mapx.ptr);
-    float4 mapy_coords = vload4(0, (__global float *)mapy.ptr);
-    float8 map_coords  = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1,
-                                  mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3);
-
-    vstore4(read_texels4(&in, convert_int8(clamp_to_border(map_coords, width, height))), 0, out.ptr);
-}
-
-/** Performs a remapping of an input image to an output given two remapping image using bilinear as interpolation.
- *
- * This kernel performs remapping with this method of pixel coordinate translation:
- *     out(x,y) = in(mapx(x,y), mapy(x,y));
- *
- * @param[in]  in_ptr                             Pointer to the source image. Supported data types: U8.
- * @param[in]  in_stride_x                        Stride of the source image in X dimension (in bytes)
- * @param[in]  in_step_x                          in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  in_stride_y                        Stride of the source image in Y dimension (in bytes)
- * @param[in]  in_step_y                          in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in]  in_offset_first_element_in_bytes   Offset of the first element in the source image
- * @param[out] out_ptr                            Pointer to the destination image. Supported data types: U8.
- * @param[in]  out_stride_x                       Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                         out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  out_stride_y                       Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                         out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in]  out_offset_first_element_in_bytes  Offset of the first element in the destination image
- * @param[in]  mapx_ptr                           Pointer to the x remapping image. Supported data types: F32.
- * @param[in]  mapx_stride_x                      Stride of the remapping image in X dimension (in bytes)
- * @param[in]  mapx_step_x                        mapx_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  mapx_stride_y                      Stride of the remapping image in Y dimension (in bytes)
- * @param[in]  mapx_step_y                        mapy_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in]  mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in]  mapy_ptr                           Pointer to the x remapping image. Supported data types: F32.
- * @param[in]  mapy_stride_x                      Stride of the remapping image in X dimension (in bytes)
- * @param[in]  mapy_step_x                        mapy_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in]  mapy_stride_y                      Stride of the remapping image in Y dimension (in bytes)
- * @param[in]  mapy_step_y                        mapy_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in]  mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in]  width                              Width of the input image
- * @param[in]  height                             Height of the input image
- */
-__kernel void remap_bilinear_nchw(
-    IMAGE_DECLARATION(in),
-    IMAGE_DECLARATION(out),
-    IMAGE_DECLARATION(mapx),
-    IMAGE_DECLARATION(mapy),
-    const float width,
-    const float height)
-{
-    Image in   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
-    Image out  = CONVERT_TO_IMAGE_STRUCT(out);
-    Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx);
-    Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy);
-
-    float4 mapx_coords = vload4(0, (__global float *)mapx.ptr);
-    float4 mapy_coords = vload4(0, (__global float *)mapy.ptr);
-    float8 map_coords  = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1,
-                                  mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3);
-
-    vstore4(bilinear_interpolate(&in, clamp_to_border(map_coords, width, height), width, height), 0, out.ptr);
-}
-#else // DEPTH_OUT
+#ifdef DEPTH_OUT
 /** Performs a remapping of an input image to an output given two remapping image using nearest neighbor as interpolation.
  *  Also applies constant border value, "border_val", if "CONSTANT_BORDER" is set.
  *
@@ -283,4 +177,4 @@ __kernel void remap_bilinear_nhwc(
     *((__global DATA_TYPE *)out.ptr) = CONVERT(fr, DATA_TYPE);
 }
 
-#endif // DEPTH_OUT
+#endif // DEPTH_OUT
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/reorg_layer.cl b/src/core/CL/cl_kernels/nhwc/reorg_layer.cl
new file mode 100644
index 0000000000..a340b0b8a2
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/reorg_layer.cl
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE)
+
+#define CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi)     \
+    ({                                                        \
+        int offset = zo / (int)SRC_DEPTH;                     \
+        xi         = xo * (int)STRIDE + offset % (int)STRIDE; \
+        yi         = yo * (int)STRIDE + offset / (int)STRIDE; \
+        zi         = zo % SRC_DEPTH;                          \
+    })
+
+/** Performs a reorganization layer of input tensor to the output tensor when the data layout is NHWC
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64
+ * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: All
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reorg_layer_nhwc(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    int xo = get_global_id(1);
+    int yo = get_global_id(2);
+    int zo = get_global_id(0);
+    int xi, yi, zi;
+
+    CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi);
+
+    int src_offset = zi * sizeof(DATA_TYPE) + xi * src_stride_y + yi * src_stride_z;
+
+    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset));
+}
+#endif // // defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/scale.cl b/src/core/CL/cl_kernels/nhwc/scale.cl
index d4c27e6cf6..1ea5e73df1 100644
--- a/src/core/CL/cl_kernels/scale.cl
+++ b/src/core/CL/cl_kernels/nhwc/scale.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,129 +24,6 @@
 #include "helpers.h"
 #include "warp_helpers.h"
 
-/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
- *
- * @param[in] coord 2D coordinates to transform.
- * @param[in] scale input/output scale ratio
- *
- * @return a float8 containing 4 2D transformed values in the input image.
- */
-inline const float8 transform_nearest(const float2 coord, const float2 scale)
-{
-#ifdef SAMPLING_POLICY_TOP_LEFT
-    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
-    const float4 new_x       = in_x_coords * (float4)(scale.s0);
-    const float4 new_y       = (float4)(coord.s1 * scale.s1);
-    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-#elif SAMPLING_POLICY_CENTER
-    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
-    const float4 new_x       = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0);
-    const float4 new_y       = (float4)((coord.s1 + 0.5f) * scale.s1);
-    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-#else /* SAMPLING_POLICY */
-#error("Unsupported sampling policy");
-#endif /* SAMPLING_POLICY */
-}
-
-/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
- *
- * @param[in] coord 2D coordinates to transform.
- * @param[in] scale input/output scale ratio
- *
- * @return a float8 containing 4 2D transformed values in the input image.
- */
-inline const float8 transform_bilinear(const float2 coord, const float2 scale)
-{
-    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
-#ifdef SAMPLING_POLICY_TOP_LEFT
-    const float4 new_x = in_x_coords * (float4)(scale.s0);
-    const float4 new_y = (float4)(coord.s1 * scale.s1);
-    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-#elif SAMPLING_POLICY_CENTER
-    const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
-    const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
-    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-#else /* SAMPLING_POLICY */
-#error("Unsupported sampling policy");
-#endif /* SAMPLING_POLICY */
-}
-
-/** Performs an affine transformation on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel U8 or S16.
- *
- * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- *
- * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8, S16.
- * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  in_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  input_width                       Input image width
- * @param[in]  input_height                      Input image height
- * @param[in]  scale_x                           The scale factor along x dimension
- * @param[in]  scale_y                           The scale factor along y dimension
- */
-__kernel void scale_nearest_neighbour_nchw(
-    IMAGE_DECLARATION(in),
-    IMAGE_DECLARATION(out),
-    const float input_width,
-    const float input_height,
-    const float scale_x,
-    const float scale_y)
-{
-    Image        in          = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
-    Image        out         = CONVERT_TO_IMAGE_STRUCT(out);
-    const float2 r           = (float2)(scale_x, scale_y);
-    float8       transformed = transform_nearest(get_current_coords(), r);
-#ifdef ALIGN_CORNERS
-    transformed = round(transformed);
-#endif // ALIGN_CORNERS
-    const float8 tc = clamp_to_border_with_size(transformed, input_width, input_height, BORDER_SIZE);
-    vstore4(read_texels4(&in, convert_int8(tc)), 0, (__global DATA_TYPE *)out.ptr);
-}
-
-/** Performs an affine transformation on an image interpolating with the BILINEAR method.
- *
- * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- *
- * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8, S16.
- * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  in_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  input_width                       Input image width
- * @param[in]  input_height                      Input image height
- * @param[in]  scale_x                           The scale factor along x dimension
- * @param[in]  scale_y                           The scale factor along y dimension
- */
-__kernel void scale_bilinear_nchw(
-    IMAGE_DECLARATION(in),
-    IMAGE_DECLARATION(out),
-    const float input_width,
-    const float input_height,
-    const float scale_x,
-    const float scale_y)
-{
-    Image        in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
-    Image        out = CONVERT_TO_IMAGE_STRUCT(out);
-    const float2 r   = (float2)(scale_x, scale_y);
-    const float8 tc  = transform_bilinear(get_current_coords(), r);
-    vstore4(bilinear_interpolate_with_border(&in, tc, input_width, input_height, BORDER_SIZE), 0, (__global DATA_TYPE *)out.ptr);
-}
-
 #if defined(DEPTH_OUT)
 /** Performs scale on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel F32. (NHWC)
  *
@@ -262,22 +139,22 @@ __kernel void scale_bilinear_nhwc(
     const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1);
 
 #ifndef BORDER_MODE_REPLICATE
-    const bool  check_x = (0.f <= new_xf && new_xf < input_width);
+    const bool  check_x  = (0.f <= new_xf && new_xf < input_width);
     const bool  check_x1 = (-1.f <= new_xf && new_xf < input_width - 1);
-    const bool  check_y = (0.f <= new_yf && new_yf < input_height);
+    const bool  check_y  = (0.f <= new_yf && new_yf < input_height);
     const bool  check_y1 = (-1.f <= new_yf && new_yf < input_height - 1);
-    const float ins_0   = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y),
-                                                                                                          (get_global_id(2) / DEPTH_OUT)))),
-                                 check_x && check_y);
+    const float ins_0    = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y),
+                                                                                                           (get_global_id(2) / DEPTH_OUT)))),
+                                  check_x && check_y);
     const float ins_1 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y),
                                                                                                         (get_global_id(2) / DEPTH_OUT)))),
-                                 check_x1 && check_y);
+                               check_x1 && check_y);
     const float ins_2 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1),
                                                                                                         (get_global_id(2) / DEPTH_OUT)))),
-                                 check_x && check_y1);
+                               check_x && check_y1);
     const float ins_3 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1),
                                                                                                         (get_global_id(2) / DEPTH_OUT)))),
-                                 check_x1 && check_y1);
+                               check_x1 && check_y1);
     float4 ins = (float4)(ins_0, ins_1, ins_2, ins_3);
 #else  /* BORDER_MODE_REPLICATE */
     float4 ins        = (float4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
diff --git a/src/core/CL/cl_kernels/scale_quantized.cl b/src/core/CL/cl_kernels/nhwc/scale_quantized.cl
index 010e4ed57a..de9bb607b0 100644
--- a/src/core/CL/cl_kernels/scale_quantized.cl
+++ b/src/core/CL/cl_kernels/nhwc/scale_quantized.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,67 +24,6 @@
 #include "helpers_asymm.h"
 #include "warp_helpers_quantized.h"
 
-/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
- *
- * @param[in] coord 2D coordinates to transform.
- * @param[in] scale input/output scale ratio
- *
- * @return a float8 containing 4 2D transformed values in the input image.
- */
-inline const float8 transform_bilinear_quantized(const float2 coord, const float2 scale)
-{
-    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
-#ifdef SAMPLING_POLICY_TOP_LEFT
-    const float4 new_x = in_x_coords * (float4)(scale.s0);
-    const float4 new_y = (float4)(coord.s1 * scale.s1);
-    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-#elif SAMPLING_POLICY_CENTER
-    const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
-    const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
-    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-#else /* SAMPLING_POLICY */
-#error("Unsupported sampling policy");
-#endif /* SAMPLING_POLICY */
-}
-
-/** Performs an affine transformation on an image interpolating with the BILINEAR method.
- *
- * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- * @note Scale value for QASYMM8 data type to used is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5
- * @note Offset value for QASYMM8 data type to used is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1
- *
- * @param[in]  in_ptr                            Pointer to the source image. Supported data types: QASYMM8.
- * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  in_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  in_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  input_width                       Input image width
- * @param[in]  input_height                      Input image height
- * @param[in]  scale_x                           The scale factor along x dimension
- * @param[in]  scale_y                           The scale factor along y dimension
- */
-__kernel void scale_bilinear_quantized_nchw(
-    IMAGE_DECLARATION(in),
-    IMAGE_DECLARATION(out),
-    const float input_width,
-    const float input_height,
-    const float scale_x,
-    const float scale_y)
-{
-    Image        in  = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
-    Image        out = CONVERT_TO_IMAGE_STRUCT(out);
-    const float2 r   = (float2)(scale_x, scale_y);
-    const float8 tc  = transform_bilinear_quantized(get_current_coords_quantized(), r);
-    vstore4(bilinear_interpolate_with_border_quantized(&in, tc, input_width, input_height, BORDER_SIZE, SCALE, OFFSET), 0, (__global DATA_TYPE *)out.ptr);
-}
-
 #if defined(DEPTH_OUT)
 /** Performs scale on an image interpolating with the BILINEAR method. (NHWC)
  *
@@ -146,22 +85,22 @@ __kernel void scale_bilinear_quantized_nhwc(
     const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1);
 
 #ifndef BORDER_MODE_REPLICATE
-    const bool  check_x = (0.f <= new_xf && new_xf < input_width);
-    const bool  check_x1 = (-1.f <= new_xf && new_xf < input_width - 1);
-    const bool  check_y = (0.f <= new_yf && new_yf < input_height);
-    const bool  check_y1 = (-1.f <= new_yf && new_yf < input_height - 1);
-    const int ins_0    = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y),
-                                                                                                     (get_global_id(2) / DEPTH_OUT)))),
+    const bool check_x  = (0.f <= new_xf && new_xf < input_width);
+    const bool check_x1 = (-1.f <= new_xf && new_xf < input_width - 1);
+    const bool check_y  = (0.f <= new_yf && new_yf < input_height);
+    const bool check_y1 = (-1.f <= new_yf && new_yf < input_height - 1);
+    const int ins_0     = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y),
+                                                                                                      (get_global_id(2) / DEPTH_OUT)))),
                                  check_x && check_y);
     const int ins_1 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y),
                                                                                                   (get_global_id(2) / DEPTH_OUT)))),
-                                 check_x1 && check_y);
+                             check_x1 && check_y);
     const int ins_2 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1),
                                                                                                   (get_global_id(2) / DEPTH_OUT)))),
-                                 check_x && check_y1);
+                             check_x && check_y1);
     const int ins_3 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1),
                                                                                                   (get_global_id(2) / DEPTH_OUT)))),
-                                 check_x1 && check_y1);
+                             check_x1 && check_y1);
     int4 ins = (int4)(ins_0, ins_1, ins_2, ins_3);
 #else  /* BORDER_MODE_REPLICATE */
     int4 ins          = (int4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
diff --git a/src/core/CL/cl_kernels/nhwc/space_to_batch.cl b/src/core/CL/cl_kernels/nhwc/space_to_batch.cl
new file mode 100644
index 0000000000..785206e3b9
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/space_to_batch.cl
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
+/** Calculate the space to batch conversion. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The block shape tensor rank must be passed at compile time using -DBLOCK_SHAPE_DIM. e.g. -DBLOCK_SHAPE_DIM=2
+ *
+ * @param[in]  input_ptr                                 Pointer to the source tensor. Supported data types: All
+ * @param[in]  input_stride_x                            Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                              input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                            Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                              input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                            Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                              input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes       The offset of the first element in the first source image
+ * @param[in]  paddings_ptr                              Pointer to the second source image. Supported data types: S32
+ * @param[in]  paddings_stride_x                         Stride of the paddinds tensor in X dimension (in bytes)
+ * @param[in]  paddings_step_x                           paddings_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  paddings_stride_y                         Stride of the paddinds tensor in Y dimension (in bytes)
+ * @param[in]  paddings_step_y                           paddings_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  paddingse_offset_first_element_in_bytes   The offset of the first element in the second source image
+ * @param[in]  block_shape_ptr                           Pointer to the block shape tensor. Supported data types: S32
+ * @param[in]  block_shape_stride_x                      Stride of the block shape tensor in X dimension (in bytes)
+ * @param[in]  block_shape_step_x                        block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  block_shape_offset_first_element_in_bytes The offset of the first element in the block shapetensor
+ * @param[in]  batch_id                                  The output tensor batch id
+ * @param[out] output_ptr                                Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                           Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                             output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                           Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                             output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                           Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                             output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes      The offset of the first element in the destination image
+ */
+__kernel void space_to_batch_nhwc(
+    TENSOR4D_DECLARATION(input),
+    IMAGE_DECLARATION(paddings),
+    VECTOR_DECLARATION(block_shape),
+    const int batch_id,
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor4D in    = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+    Image    pad   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(paddings);
+    Vector   block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
+    Tensor3D out   = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    const int pad_left_x  = *((__global int *)offset(&pad, 0, 0));
+    const int pad_right_x = *((__global int *)offset(&pad, 1, 0));
+    const int pad_left_y  = *((__global int *)offset(&pad, 0, 1));
+    const int pad_right_y = *((__global int *)offset(&pad, 1, 1));
+
+    int block_x = *((__global int *)vector_offset(&block, 0));
+    int block_y = *((__global int *)vector_offset(&block, 1));
+
+    const int out_x = get_global_id(1);
+    const int out_y = get_global_id(2);
+    const int z     = get_global_id(0);
+
+    const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+    const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
+    if(((pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) && (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN)))
+    {
+        const int w    = batch_id % BATCH_IN;
+        const int in_x = pos_x - pad_left_x;
+        const int in_y = pos_y - pad_left_y;
+
+        *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w));
+    }
+}
+#endif // defined(BATCH_SIZE) && defined(DATA_TYPE)  && defined(WIDTH_IN) && defined(HEIGHT_IN)
+
+#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
+/** Calculate the space to batch conversion. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
+ * @note The starting pad value of x must be passed at compile time using -DPAD_LEFT_X. e.g. -DPAD_LEFT_X=2
+ * @note The ending pad value of x must be passed at compile time using -DPAD_RIGHT_X. e.g. -DPAD_RIGHT_X=2
+ * @note The starting pad value of y must be passed at compile time using -DPAD_LEFT_Y. e.g. -DPAD_LEFT_Y=2
+ * @note The ending pad value of  y must be passed at compile time using -DPAD_RIGHT_Y. e.g. -DPAD_RIGHT_X=2
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: All
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source image
+ * @param[in]  batch_id                             The output tensor batch id
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_batch_static_nhwc(
+    TENSOR4D_DECLARATION(input),
+    const int batch_id,
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    int block_x = BLOCK_SHAPE_X;
+    int block_y = BLOCK_SHAPE_Y;
+
+    const int out_x = get_global_id(1);
+    const int out_y = get_global_id(2);
+    const int z     = get_global_id(0);
+
+    const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+    const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
+    if(pos_y >= PAD_LEFT_Y && pos_y < PAD_LEFT_Y + HEIGHT_IN && pos_x >= PAD_LEFT_X && pos_x < PAD_LEFT_X + WIDTH_IN)
+    {
+        const int w    = batch_id % BATCH_IN;
+        const int in_x = pos_x - PAD_LEFT_X;
+        const int in_y = pos_y - PAD_LEFT_Y;
+
+        *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w));
+    }
+}
+#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y)  && defined(WIDTH_IN) && defined(HEIGHT_IN)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/space_to_depth.cl b/src/core/CL/cl_kernels/nhwc/space_to_depth.cl
new file mode 100644
index 0000000000..d44e78d990
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/space_to_depth.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+/** Space to depth transformation. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
+ * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: All
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[in]  batch_id                             The input tensor batch id
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void space_to_depth_nhwc(
+    TENSOR4D_DECLARATION(input),
+    const int batch_id,
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+    const int x = get_global_id(1);
+    const int y = get_global_id(2);
+    const int z = get_global_id(0) % r;
+
+    const int in_x = x * BLOCK_SHAPE + (get_global_id(0) / r) % BLOCK_SHAPE;
+    const int in_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE;
+
+    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, batch_id));
+}
+#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/upsample_layer.cl b/src/core/CL/cl_kernels/nhwc/upsample_layer.cl
index d0cc0f24b7..74b9674a88 100644
--- a/src/core/CL/cl_kernels/upsample_layer.cl
+++ b/src/core/CL/cl_kernels/nhwc/upsample_layer.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,61 +23,6 @@
  */
 #include "helpers.h"
 
-/** This function applies upsample on an input image. (NCHW)
- *
- * @attention The following variables must be passed at compile time:
- * -# -DDATA_TYPE = Tensor data type. Supported data types: All
- * -# -DVEC_SIZE_IN = Input vector size
- * -# -DVEC_SIZE_OUT = Output vector size
- * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to set this, might need to step back a bit)
- * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to set this, might need to step back a bit)
- *
- * @param[in]  src_ptr                           Pointer to the source image. Supported data types: All
- * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void upsample_layer_nchw(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
-{
-    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
-#if defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
-    // Check if access on width gets out of bounds
-    // If it does shift access vector to access elements within bounds
-    const int xi_in  = (int)(get_global_id(0) * VEC_SIZE_IN);
-    const int xi_out = (int)(get_global_id(0) * VEC_SIZE_OUT);
-    src.ptr -= max(xi_in - (int)LAST_ACCESSED_X_IN, 0) * src_stride_x;
-    dst.ptr -= max(xi_out - (int)LAST_ACCESSED_X_OUT, 0) * dst_stride_x;
-
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    data = vload8(0, (__global DATA_TYPE *)src.ptr);
-
-    VEC_DATA_TYPE(DATA_TYPE, 16)
-    data_out = (VEC_DATA_TYPE(DATA_TYPE, 16))(data.s0, data.s0, data.s1, data.s1, data.s2, data.s2, data.s3, data.s3, data.s4, data.s4, data.s5, data.s5, data.s6, data.s6, data.s7, data.s7);
-
-    vstore16(data_out, 0, (__global DATA_TYPE *)dst.ptr);
-    vstore16(data_out, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0));
-#else  // !defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
-    *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0)) = *((__global DATA_TYPE *)src.ptr);
-    *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0)) = *((__global DATA_TYPE *)src.ptr);
-#endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
-}
-
 /** This function applies upsample on an input image. (NHWC)
  *
  * @attention The following variables must be passed at compile time:
@@ -132,4 +77,4 @@ __kernel void upsample_layer_nhwc(
     *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 1)) = *((__global DATA_TYPE *)src.ptr);
     *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 1)) = *((__global DATA_TYPE *)src.ptr);
 #endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
-}
+}
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/winograd_filter_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl
index 5c3bb8aa9b..8d5fd3437f 100644
--- a/src/core/CL/cl_kernels/winograd_filter_transform.cl
+++ b/src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,288 +37,6 @@
         out.s7 = tmp.s6;                                                                                                           \
     })
 
-/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 2x2/2x1/1x2
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_2x2_3x3_nchw(
-    TENSOR4D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
-{
-    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
-
-    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
-
-    // Load the values from the input tensor
-#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
-    VEC_DATA_TYPE(DATA_TYPE, 3)
-    w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
-#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-    VEC_DATA_TYPE(DATA_TYPE, 3)
-    w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
-                                       *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
-                                       *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
-#else  // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-    VEC_DATA_TYPE(DATA_TYPE, 3)
-    w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
-    VEC_DATA_TYPE(DATA_TYPE, 3)
-    w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
-    VEC_DATA_TYPE(DATA_TYPE, 3)
-    w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
-#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
-
-    // Row 0
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    out0    = 0.0f;
-    out0.s0 = (w0.s0);
-    out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f;
-    out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f;
-    out0.s3 = (w0.s2);
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-    // Row 1
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    out1    = 0.0f;
-    out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f;
-    out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f;
-    out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f;
-    out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f;
-
-    // Row 2
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    out2    = 0.0f;
-    out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f;
-    out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f;
-    out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f;
-    out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f;
-
-    // Row 3
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    out3    = 0.0f;
-    out3.s0 = (w2.s0);
-    out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f;
-    out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f;
-    out3.s3 = (w2.s2);
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-
-    int z  = get_global_id(2);
-    int x0 = z / SRC_DIM_Z; // idx filter
-    int y0 = z % SRC_DIM_Z; // idx channel
-
-    // Get output address
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
-
-    // Store the values across the channels
-    // 16 channels for 3x3 kernels
-    // 4 channels for 3x1 or 1x3 kernels
-    *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
-    *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
-    *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
-    *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-    *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)  = out1.s0;
-    *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)  = out1.s1;
-    *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)  = out1.s2;
-    *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)  = out1.s3;
-    *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)  = out2.s0;
-    *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)  = out2.s1;
-    *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out2.s2;
-    *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out2.s3;
-    *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out3.s0;
-    *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out3.s1;
-    *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out3.s2;
-    *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out3.s3;
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-}
-
-/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 4x4/4x1/1x4
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_4x4_3x3_nchw(
-    TENSOR4D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
-{
-    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
-
-    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
-
-    // Load the values from the input tensor
-#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
-    VEC_DATA_TYPE(DATA_TYPE, 3)
-    w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
-#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-    VEC_DATA_TYPE(DATA_TYPE, 3)
-    w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
-                                       *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
-                                       *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
-#else  // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-    VEC_DATA_TYPE(DATA_TYPE, 3)
-    w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
-    VEC_DATA_TYPE(DATA_TYPE, 3)
-    w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
-    VEC_DATA_TYPE(DATA_TYPE, 3)
-    w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
-#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
-
-    // Row 0
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    out0    = 0.0f;
-    out0.s0 = (w0.s0) / 16.f;
-    out0.s1 = (-w0.s0 - w0.s1 - w0.s2) / 24.f;
-    out0.s2 = (-w0.s0 + w0.s1 - w0.s2) / 24.f;
-    out0.s3 = (w0.s0 + 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
-    out0.s4 = (w0.s0 - 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
-    out0.s5 = (w0.s2) / 4.f;
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-    // Row 1
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    out1    = 0.0f;
-    out1.s0 = (-w0.s0 - w1.s0 - w2.s0) / 24.f;
-    out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
-    out1.s2 = (w0.s0 + w1.s0 + w2.s0 - w0.s1 - w1.s1 - w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
-    out1.s3 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (-w0.s1 - w1.s1 - w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
-    out1.s4 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (w0.s1 + w1.s1 + w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
-    out1.s5 = (-w0.s2 - w1.s2 - w2.s2) / 6.f;
-
-    // Row 2
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    out2    = 0.0f;
-    out2.s0 = (-w0.s0 + w1.s0 - w2.s0) / 24.f;
-    out2.s1 = (w0.s0 - w1.s0 + w2.s0 + w0.s1 - w1.s1 + w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
-    out2.s2 = (w0.s0 - w1.s0 + w2.s0 - w0.s1 + w1.s1 - w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
-    out2.s3 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (-w0.s1 + w1.s1 - w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
-    out2.s4 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (w0.s1 - w1.s1 + w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
-    out2.s5 = (-w0.s2 + w1.s2 - w2.s2) / 6.f;
-
-    // Row 3
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    out3    = 0.0f;
-    out3.s0 = (w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
-    out3.s1 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 - 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
-    out3.s2 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 + 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
-    out3.s3 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 + 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
-    out3.s4 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 - 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
-    out3.s5 = (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
-
-    // Row 4
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    out4    = 0.0f;
-    out4.s0 = (w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
-    out4.s1 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 + 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
-    out4.s2 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 - 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
-    out4.s3 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 - 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
-    out4.s4 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 + 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
-    out4.s5 = (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
-
-    // Row 5
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    out5    = 0.0f;
-    out5.s0 = (w2.s0) / 4.f;
-    out5.s1 = (-w2.s0 - w2.s1 - w2.s2) / 6.f;
-    out5.s2 = (-w2.s0 + w2.s1 - w2.s2) / 6.f;
-    out5.s3 = (w2.s0 + 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
-    out5.s4 = (w2.s0 - 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
-    out5.s5 = (w2.s2);
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-
-    int z  = get_global_id(2);
-    int x0 = z / SRC_DIM_Z; // idx filter
-    int y0 = z % SRC_DIM_Z; // idx channel
-
-    // Get output address
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
-
-    // Store the values across the channels
-    // 36 channels for 3x3 kernels
-    // 6 channels for 3x1 or 1x3 kernels
-    *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
-    *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
-    *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
-    *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
-    *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
-    *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-    *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)  = out1.s0;
-    *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)  = out1.s1;
-    *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)  = out1.s2;
-    *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)  = out1.s3;
-    *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s4;
-    *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s5;
-    *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out2.s0;
-    *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out2.s1;
-    *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out2.s2;
-    *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out2.s3;
-    *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s4;
-    *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s5;
-    *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out3.s0;
-    *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out3.s1;
-    *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out3.s2;
-    *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out3.s3;
-    *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out3.s4;
-    *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out3.s5;
-    *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out4.s0;
-    *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out4.s1;
-    *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out4.s2;
-    *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out4.s3;
-    *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out4.s4;
-    *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out4.s5;
-    *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out5.s0;
-    *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out5.s1;
-    *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out5.s2;
-    *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out5.s3;
-    *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out5.s4;
-    *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out5.s5;
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-}
-
 /** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NHWC and the output tile is 4x4/4x1/1x4
  *
  * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
@@ -477,308 +195,6 @@ __kernel void winograd_filter_transform_4x4_3x3_nhwc(
 #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
 }
 
-/** This OpenCL kernel performs Winograd filter transform 5x5/5x1 or 1x5 when the data layout is NCHW and the output tile is 4x4/4x1 or 1x4
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- *
- * @note If this kernel is used to perform Winograd filter transform 5x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd filter transform 1x5, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_4x4_5x5_nchw(
-    TENSOR4D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
-{
-    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
-
-    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
-
-    // Load the values from the input tensor
-#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    w00           = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
-    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
-#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    w00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
-                                        *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
-                                        *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
-                                        *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
-    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
-#else  // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    w00           = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
-    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    w10           = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
-    DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y) + 4);
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    w20           = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
-    DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y) + 4);
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    w30           = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
-    DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y) + 4);
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    w40           = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
-    DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y) + 4);
-#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
-
-    // Transform the input tile
-
-    // Row 0
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    out0    = 0.0f;
-    out0.s0 = w00.s0;
-    out0.s1 = -2.f * (w00.s0 + w00.s1 + w00.s2 + w00.s3 + w01) / 9.f;
-    out0.s2 = -2.f * (w00.s0 - w00.s1 + w00.s2 - w00.s3 + w01) / 9.f;
-    out0.s3 = (w00.s0 + 2.f * w00.s1 + 4.f * w00.s2 + 8.f * w00.s3 + 16.f * w01) / 90.f;
-    out0.s4 = (w00.s0 - 2.f * w00.s1 + 4.f * w00.s2 - 8.f * w00.s3 + 16.f * w01) / 90.f;
-    out0.s5 = (16.f * w00.s0 + 8.f * w00.s1 + 4.f * w00.s2 + 2.f * w00.s3 + w01) / 180.f;
-    out0.s6 = (16.f * w00.s0 - 8.f * w00.s1 + 4.f * w00.s2 - 2.f * w00.s3 + w01) / 180.f;
-    out0.s7 = w01;
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-    // Row 1
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    out1    = 0.0f;
-    out1.s0 = -2.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) / 9.f;
-    out1.s1 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) +
-                     (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
-    out1.s2 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) -
-                     (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
-    out1.s3 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 8.f *
-                (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
-    out1.s4 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 8.f *
-                (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
-    out1.s5 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 2.f *
-                (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
-    out1.s6 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 2.f *
-                (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
-    out1.s7 = -2.f * (w01 + w11 + w21 + w31 + w41) / 9.f;
-
-    // Row 2
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    out2    = 0.0f;
-    out2.s0 = -2.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) / 9.f;
-    out2.s1 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) +
-                     (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
-    out2.s2 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) -
-                     (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
-    out2.s3 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 8.f *
-                (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
-    out2.s4 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 8.f *
-                (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
-    out2.s5 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 2.f *
-                (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
-    out2.s6 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 2.f *
-                (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
-    out2.s7 = -2.f * (w01 - w11 + w21 - w31 + w41) / 9.f;
-
-    // Row 3
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    out3    = 0.0f;
-    out3.s0 = (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
-    out3.s1 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
-                (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
-                (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
-    out3.s2 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
-                (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
-                (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
-    out3.s3 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
-               (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
-               (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
-    out3.s4 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
-               (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
-               (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
-    out3.s5 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
-               (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
-               (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
-    out3.s6 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
-               (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
-               (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
-    out3.s7 = (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) / 90.f;
-
-    // Row 4
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    out4    = 0.0f;
-    out4.s0 = (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
-    out4.s1 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
-                (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
-                (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
-    out4.s2 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
-                (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
-                (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
-    out4.s3 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
-               (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
-               (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
-    out4.s4 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
-               (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
-               (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
-    out4.s5 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
-               (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
-               (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
-    out4.s6 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
-               (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
-               (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
-    out4.s7 = (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) / 90.f;
-
-    // Row 5
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    out5    = 0.0f;
-    out5.s0 = (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) / 180.f;
-    out5.s1 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
-                (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
-                (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
-    out5.s2 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
-                (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
-                (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
-    out5.s3 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
-               (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
-               (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
-    out5.s4 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
-               (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
-               (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
-    out5.s5 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
-               (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
-               (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
-    out5.s6 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
-               (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
-               (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
-    out5.s7 = (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) / 180.f;
-
-    // Row 6
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    out6    = 0.0f;
-    out6.s0 = (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) / 180.f;
-    out6.s1 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
-                (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
-                (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
-    out6.s2 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
-                (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
-                (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
-    out6.s3 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
-               (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
-               (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
-    out6.s4 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
-               (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
-               (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
-    out6.s5 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
-               (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
-               (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
-    out6.s6 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
-               (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
-               (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
-    out6.s7 = (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) / 180.f;
-
-    // Row 7
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    out7    = 0.0f;
-    out7.s0 = w40.s0;
-    out7.s1 = -2.f * (w40.s0 + w40.s1 + w40.s2 + w40.s3 + w41) / 9.f;
-    out7.s2 = -2.f * (w40.s0 - w40.s1 + w40.s2 - w40.s3 + w41) / 9.f;
-    out7.s3 = (w40.s0 + 2.f * w40.s1 + 4.f * w40.s2 + 8.f * w40.s3 + 16.f * w41) / 90.f;
-    out7.s4 = (w40.s0 - 2.f * w40.s1 + 4.f * w40.s2 - 8.f * w40.s3 + 16.f * w41) / 90.f;
-    out7.s5 = (16.f * w40.s0 + 8.f * w40.s1 + 4.f * w40.s2 + 2.f * w40.s3 + w41) / 180.f;
-    out7.s6 = (16.f * w40.s0 - 8.f * w40.s1 + 4.f * w40.s2 - 2.f * w40.s3 + w41) / 180.f;
-    out7.s7 = w41;
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-
-    int z  = get_global_id(2);
-    int x0 = z / SRC_DIM_Z; // idx filter
-    int y0 = z % SRC_DIM_Z; // idx channel
-
-    // Get output address
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
-
-    // Store the values across the channels
-    *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
-    *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
-    *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
-    *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
-    *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
-    *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
-    *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;
-    *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-    *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)  = out1.s0;
-    *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)  = out1.s1;
-    *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;
-    *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;
-    *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;
-    *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;
-    *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;
-    *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;
-    *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;
-    *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;
-    *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;
-    *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;
-    *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;
-    *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;
-    *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;
-    *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;
-    *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;
-    *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;
-    *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;
-    *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;
-    *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;
-    *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;
-    *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;
-    *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;
-    *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;
-    *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;
-    *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;
-    *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;
-    *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;
-    *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;
-    *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;
-    *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;
-    *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;
-    *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;
-    *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;
-    *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;
-    *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;
-    *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;
-    *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;
-    *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;
-    *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;
-    *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;
-    *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;
-    *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;
-    *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;
-    *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;
-    *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;
-    *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;
-    *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;
-    *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;
-    *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;
-    *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;
-    *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;
-    *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;
-    *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;
-    *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-}
-
 /** This OpenCL kernel performs Winograd filter transform 5x5/5x1 or 1x5 when the data layout is NHWC and the output tile is 4x4/4x1 or 1x4
  *
  * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
@@ -1360,152 +776,6 @@ __kernel void winograd_filter_transform_2x2_7x7_nhwc(
 #endif // defined(SRC_DIM_Z)
 
 #if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
-/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 2x1
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_2x1_3x1_nchw(
-    TENSOR4D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
-{
-    winograd_filter_transform_2x2_3x3_nchw(src_ptr,
-                                           src_stride_x,
-                                           src_step_x,
-                                           src_stride_y,
-                                           src_step_y,
-                                           src_stride_z,
-                                           src_step_z,
-                                           src_stride_w,
-                                           src_step_w,
-                                           src_offset_first_element_in_bytes,
-                                           dst_ptr,
-                                           dst_stride_x,
-                                           dst_step_x,
-                                           dst_stride_y,
-                                           dst_step_y,
-                                           dst_stride_z,
-                                           dst_step_z,
-                                           dst_offset_first_element_in_bytes);
-}
-
-/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 4x1
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_4x1_3x1_nchw(
-    TENSOR4D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
-{
-    winograd_filter_transform_4x4_3x3_nchw(src_ptr,
-                                           src_stride_x,
-                                           src_step_x,
-                                           src_stride_y,
-                                           src_step_y,
-                                           src_stride_z,
-                                           src_step_z,
-                                           src_stride_w,
-                                           src_step_w,
-                                           src_offset_first_element_in_bytes,
-                                           dst_ptr,
-                                           dst_stride_x,
-                                           dst_step_x,
-                                           dst_stride_y,
-                                           dst_step_y,
-                                           dst_stride_z,
-                                           dst_step_z,
-                                           dst_offset_first_element_in_bytes);
-}
-
-/** This OpenCL kernel performs Winograd filter transform 5x1 when the data layout is NCHW and the output tile is 4x1
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_4x1_5x1_nchw(
-    TENSOR4D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
-{
-    winograd_filter_transform_4x4_5x5_nchw(src_ptr,
-                                           src_stride_x,
-                                           src_step_x,
-                                           src_stride_y,
-                                           src_step_y,
-                                           src_stride_z,
-                                           src_step_z,
-                                           src_stride_w,
-                                           src_step_w,
-                                           src_offset_first_element_in_bytes,
-                                           dst_ptr,
-                                           dst_stride_x,
-                                           dst_step_x,
-                                           dst_stride_y,
-                                           dst_step_y,
-                                           dst_stride_z,
-                                           dst_step_z,
-                                           dst_offset_first_element_in_bytes);
-}
 
 /** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NHWC and the output tile is 4x1
  *
@@ -1656,153 +926,6 @@ __kernel void winograd_filter_transform_2x1_7x1_nhwc(
 #endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
 
 #if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x2
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_1x2_1x3_nchw(
-    TENSOR4D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
-{
-    winograd_filter_transform_2x2_3x3_nchw(src_ptr,
-                                           src_stride_x,
-                                           src_step_x,
-                                           src_stride_y,
-                                           src_step_y,
-                                           src_stride_z,
-                                           src_step_z,
-                                           src_stride_w,
-                                           src_step_w,
-                                           src_offset_first_element_in_bytes,
-                                           dst_ptr,
-                                           dst_stride_x,
-                                           dst_step_x,
-                                           dst_stride_y,
-                                           dst_step_y,
-                                           dst_stride_z,
-                                           dst_step_z,
-                                           dst_offset_first_element_in_bytes);
-}
-
-/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x4
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_1x4_1x3_nchw(
-    TENSOR4D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
-{
-    winograd_filter_transform_4x4_3x3_nchw(src_ptr,
-                                           src_stride_x,
-                                           src_step_x,
-                                           src_stride_y,
-                                           src_step_y,
-                                           src_stride_z,
-                                           src_step_z,
-                                           src_stride_w,
-                                           src_step_w,
-                                           src_offset_first_element_in_bytes,
-                                           dst_ptr,
-                                           dst_stride_x,
-                                           dst_step_x,
-                                           dst_stride_y,
-                                           dst_step_y,
-                                           dst_stride_z,
-                                           dst_step_z,
-                                           dst_offset_first_element_in_bytes);
-}
-
-/** This OpenCL kernel performs Winograd filter transform 1x5 when the data layout is NCHW and the output tile is 1x4
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_1x4_1x5_nchw(
-    TENSOR4D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
-{
-    winograd_filter_transform_4x4_5x5_nchw(src_ptr,
-                                           src_stride_x,
-                                           src_step_x,
-                                           src_stride_y,
-                                           src_step_y,
-                                           src_stride_z,
-                                           src_step_z,
-                                           src_stride_w,
-                                           src_step_w,
-                                           src_offset_first_element_in_bytes,
-                                           dst_ptr,
-                                           dst_stride_x,
-                                           dst_step_x,
-                                           dst_stride_y,
-                                           dst_step_y,
-                                           dst_stride_z,
-                                           dst_step_z,
-                                           dst_offset_first_element_in_bytes);
-}
-
 /** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NHWC and the output tile is 1x4
  *
  * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
@@ -1949,4 +1072,4 @@ __kernel void winograd_filter_transform_1x2_1x7_nhwc(
                                            dst_step_z,
                                            dst_offset_first_element_in_bytes);
 }
-#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl
new file mode 100644
index 0000000000..4865982a55
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl
@@ -0,0 +1,953 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact)                     \
+    ({                                                              \
+        comm_fact.s0 = tmp.s2 - 4.25f * tmp.s4 + tmp.s6;            \
+        comm_fact.s1 = tmp.s1 - 4.25f * tmp.s3 + tmp.s5;            \
+        comm_fact.s2 = 2.5f * tmp.s3;                               \
+        comm_fact.s3 = 0.5f * tmp.s1 + 2.f * tmp.s5 - comm_fact.s2; \
+        comm_fact.s4 = 0.25f * tmp.s2 - 1.25f * tmp.s4 + tmp.s6;    \
+        comm_fact.s5 = 4.f * tmp.s2 + tmp.s6 - 5.f * tmp.s4;        \
+        comm_fact.s6 = 2.f * tmp.s1 + 0.5f * tmp.s5 - comm_fact.s2; \
+        \
+        out.s0 = tmp.s0 - tmp.s6 + 5.25f * tmp.s4 - 5.25f * tmp.s2; \
+        out.s1 = comm_fact.s0 + comm_fact.s1;                       \
+        out.s2 = comm_fact.s0 - comm_fact.s1;                       \
+        out.s3 = comm_fact.s3 + comm_fact.s4;                       \
+        out.s4 = comm_fact.s4 - comm_fact.s3;                       \
+        out.s5 = comm_fact.s5 + comm_fact.s6;                       \
+        out.s6 = comm_fact.s5 - comm_fact.s6;                       \
+        out.s7 = tmp.s7 - tmp.s1 + 5.25f * tmp.s3 - 5.25f * tmp.s5; \
+    })
+
+#define OUTPUT_ROW_2x2_7x7(out, tmp, comm_fact)                                                    \
+    ({                                                                                             \
+        comm_fact.s0 = 36.0f * tmp.s2 - 13.0f * tmp.s4 + tmp.s6;                                   \
+        comm_fact.s1 = 36.0f * tmp.s1 - 13.0f * tmp.s3 + 1.0f * tmp.s5;                            \
+        comm_fact.s2 = 9.0f * tmp.s2 - 10.0f * tmp.s4 + tmp.s6;                                    \
+        comm_fact.s3 = 18.0f * tmp.s1 - 20.0f * tmp.s3 + 2.0f * tmp.s5;                            \
+        comm_fact.s4 = 4.0f * tmp.s2 - 5.0f * tmp.s4 + tmp.s6;                                     \
+        comm_fact.s5 = 12.0f * tmp.s1 - 15.0f * tmp.s3 + 3.0f * tmp.s5;                            \
+        out.s0       = -36.0f * tmp.s0 + 49.0f * tmp.s2 + -14.0f * tmp.s4 + tmp.s6;                \
+        out.s1       = comm_fact.s0 - comm_fact.s1;                                                \
+        out.s2       = comm_fact.s0 + comm_fact.s1;                                                \
+        out.s3       = comm_fact.s2 - comm_fact.s3;                                                \
+        out.s4       = comm_fact.s2 + comm_fact.s3;                                                \
+        out.s5       = comm_fact.s4 - comm_fact.s5;                                                \
+        out.s6       = comm_fact.s4 + comm_fact.s5;                                                \
+        out.s7       = -36.0f * tmp.s1 + 0.0f * tmp.s2 + 49.0f * tmp.s3 - 14.0f * tmp.s5 + tmp.s7; \
+    })
+
+#if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
+
+#if defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+//! @endcond
+__kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
+    TENSOR4D(src, BUFFER),
+    TENSOR4D(dst, BUFFER))
+{
+    const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
+    const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
+    const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+
+    // All the tensor dimensions are passed at compile time.
+    // In case of dynamic tensor support, the following dimensions should be passed as function argument.
+#define _ISRC_WIDTH SRC_WIDTH
+#define _ISRC_HEIGHT SRC_HEIGHT
+#define _INUM_TILES_X NUM_TILES_X
+#define _INUM_TILES_Y NUM_TILES_Y
+
+    int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
+    int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
+    x -= PAD_LEFT;
+    y -= PAD_TOP;
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    TILE(DATA_TYPE, 6, 1, in);
+    TILE(DATA_TYPE, 6, 1, out);
+
+    // Initialize the input tile
+    LOOP_UNROLLING(int, i, 0, 1, 6,
+    {
+        in[i].v = 0;
+    })
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+    T_LOAD_NHWC(DATA_TYPE, 1, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#else  // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+    T_LOAD_NHWC(DATA_TYPE, 6, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+
+    TILE(DATA_TYPE, 6, 1, com);
+
+    LOOP_UNROLLING(int, i, 0, 1, 6,
+    {
+        in[i].v *= 4.0f;
+    })
+
+    com[0].v = in[2].v - 4.f * in[0].v;
+    com[1].v = in[3].v - 4.f * in[1].v;
+    com[2].v = in[4].v - 4.f * in[2].v;
+    com[3].v = in[5].v - 4.f * in[3].v;
+    com[4].v = in[3].v - in[1].v;
+    com[4].v = com[4].v + com[4].v;
+    com[5].v = in[4].v - in[2].v;
+
+    out[0].v = com[2].v - com[0].v;
+    out[1].v = com[2].v + com[1].v;
+    out[2].v = com[2].v - com[1].v;
+    out[3].v = com[5].v + com[4].v;
+    out[4].v = com[5].v - com[4].v;
+    out[5].v = com[3].v - com[1].v;
+
+    TILE(uint, 6, 1, dst_indirect_y);
+
+    LOOP_UNROLLING(int, i, 0, 1, 6,
+    {
+        dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
+        dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 6;
+    })
+
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 6, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else  // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    TILE(DATA_TYPE, 36, 1, in);
+
+    // Initialize the input tile
+    LOOP_UNROLLING(int, i, 0, 1, 36,
+    {
+        in[i].v = 0;
+    })
+
+    // Load the tile from a NHWC tensor
+    T_LOAD_NHWC(DATA_TYPE, 6, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+
+    TILE(DATA_TYPE, 6, 1, com);
+    TILE(DATA_TYPE, 36, 1, tmp);
+
+    LOOP_UNROLLING(int, i, 0, 1, 6,
+    {
+        com[0].v         = in[2 * 6 + i].v - (DATA_TYPE)4.0f * in[0 * 6 + i].v;
+        com[1].v         = in[3 * 6 + i].v - (DATA_TYPE)4.0f * in[1 * 6 + i].v;
+        com[2].v         = in[4 * 6 + i].v - (DATA_TYPE)4.0f * in[2 * 6 + i].v;
+        com[3].v         = in[5 * 6 + i].v - (DATA_TYPE)4.0f * in[3 * 6 + i].v;
+        com[4].v         = in[3 * 6 + i].v - in[1 * 6 + i].v;
+        com[4].v         = com[4].v + com[4].v;
+        com[5].v         = in[4 * 6 + i].v - in[2 * 6 + i].v;
+        tmp[i + 0 * 6].v = com[2].v - com[0].v;
+        tmp[i + 1 * 6].v = com[2].v + com[1].v;
+        tmp[i + 2 * 6].v = com[2].v - com[1].v;
+        tmp[i + 3 * 6].v = com[5].v + com[4].v;
+        tmp[i + 4 * 6].v = com[5].v - com[4].v;
+        tmp[i + 5 * 6].v = com[3].v - com[1].v;
+    })
+
+    TILE(DATA_TYPE, 36, 1, out);
+
+    LOOP_UNROLLING(int, i, 0, 1, 6,
+    {
+        com[0].v         = tmp[i * 6 + 2].v - 4.f *tmp[i * 6 + 0].v;
+        com[1].v         = tmp[i * 6 + 3].v - 4.f *tmp[i * 6 + 1].v;
+        com[2].v         = tmp[i * 6 + 4].v - 4.f *tmp[i * 6 + 2].v;
+        com[3].v         = tmp[i * 6 + 5].v - 4.f *tmp[i * 6 + 3].v;
+        com[4].v         = tmp[i * 6 + 3].v - tmp[i * 6 + 1].v;
+        com[4].v         = com[4].v + com[4].v;
+        com[5].v         = tmp[i * 6 + 4].v - tmp[i * 6 + 2].v;
+        out[i * 6 + 0].v = com[2].v - com[0].v;
+        out[i * 6 + 1].v = com[2].v + com[1].v;
+        out[i * 6 + 2].v = com[2].v - com[1].v;
+        out[i * 6 + 3].v = com[5].v + com[4].v;
+        out[i * 6 + 4].v = com[5].v - com[4].v;
+        out[i * 6 + 5].v = com[3].v - com[1].v;
+    })
+
+    // Compute destination address
+    TILE(uint, 36, 1, dst_indirect_y);
+
+    LOOP_UNROLLING(int, i, 0, 1, 36,
+    {
+        dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
+        dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 36;
+    })
+
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 36, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+//! @endcond
+__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
+    TENSOR4D(src, BUFFER),
+    TENSOR4D(dst, BUFFER))
+{
+    const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
+    const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
+    const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+
+    // All the tensor dimensions are passed at compile time.
+    // In case of dynamic tensor support, the following dimensions should be passed as function argument.
+#define _ISRC_WIDTH SRC_WIDTH
+#define _ISRC_HEIGHT SRC_HEIGHT
+#define _INUM_TILES_X NUM_TILES_X
+#define _INUM_TILES_Y NUM_TILES_Y
+
+    int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
+    int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
+    x -= PAD_LEFT;
+    y -= PAD_TOP;
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    TILE(DATA_TYPE, 8, 1, in);
+    TILE(DATA_TYPE, 8, 1, out);
+
+    // Initialize the input tile
+    LOOP_UNROLLING(int, i, 0, 1, 8,
+    {
+        in[i].v = 0;
+    })
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+    T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#else  // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+    T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+
+    TILE(DATA_TYPE, 1, 8, com);
+
+    com[0].s[0] = in[2].v - 4.25f * in[4].v + in[6].v;
+    com[0].s[1] = in[1].v - 4.25f * in[3].v + in[5].v;
+    com[0].s[2] = 0.5f * in[1].v - 2.5f * in[3].v + 2.0f * in[5].v;
+    com[0].s[3] = 0.25f * in[2].v - 1.25f * in[4].v + in[6].v;
+    com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v;
+    com[0].s[5] = 2.0f * in[1].v - 2.5f * in[3].v + 0.5f * in[5].v;
+    out[0].s[0] = in[0].v - 5.25f * in[2].v + 5.25f * in[4].v - in[6].v;
+    out[1].s[0] = com[0].s[0] + com[0].s[1];
+    out[2].s[0] = com[0].s[0] - com[0].s[1];
+    out[3].s[0] = com[0].s[3] + com[0].s[2];
+    out[4].s[0] = com[0].s[3] - com[0].s[2];
+    out[5].s[0] = com[0].s[4] + com[0].s[5];
+    out[6].s[0] = com[0].s[4] - com[0].s[5];
+    out[7].s[0] = -in[1].v + 5.25f * in[3].v - 5.25f * in[5].v + in[7].v;
+
+    TILE(uint, 8, 1, dst_indirect_y);
+
+    LOOP_UNROLLING(int, i, 0, 1, 8,
+    {
+        dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
+        dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 8;
+    })
+
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    TILE(DATA_TYPE, 64, 1, in);
+    TILE(DATA_TYPE, 64, 1, out);
+
+    // Initialize the input tile
+    LOOP_UNROLLING(int, i, 0, 1, 64,
+    {
+        in[i].v = 0;
+    })
+
+    // Load the tile from a NHWC tensor
+    T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+
+    TILE(DATA_TYPE, 8, 8, com);
+
+    LOOP_UNROLLING(int, i, 0, 1, 8,
+    {
+        com[0].s[i] = in[2 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];                                    // x
+        com[1].s[i] = in[1 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0];                                    // x
+        com[2].s[i] = (DATA_TYPE)0.25f * in[2 * 8 + i].s[0] - (DATA_TYPE)1.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];                 // x
+        com[3].s[i] = (DATA_TYPE)0.5f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0]; // x
+        com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
+        com[5].s[i] = (DATA_TYPE)2.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)0.5f * in[5 * 8 + i].s[0];
+        com[6].s[i] = in[0 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[2 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[4 * 8 + i].s[0] - in[6 * 8 + i].s[0];
+        com[7].s[i] = -in[1 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[3 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[5 * 8 + i].s[0] + in[7 * 8 + i].s[0];
+    })
+
+    TILE(DATA_TYPE, 8, 8, tmp);
+    tmp[0].v = com[6].v;
+    tmp[1].v = com[0].v + com[1].v;
+    tmp[2].v = com[0].v - com[1].v;
+    tmp[3].v = com[2].v + com[3].v;
+    tmp[4].v = com[2].v - com[3].v;
+    tmp[5].v = com[4].v + com[5].v;
+    tmp[6].v = com[4].v - com[5].v;
+    tmp[7].v = com[7].v;
+
+    LOOP_UNROLLING(int, i, 0, 1, 8,
+    {
+        com[0].s[0]         = tmp[i].s[2] - 4.25f * tmp[i].s[4] + tmp[i].s[6];
+        com[0].s[1]         = tmp[i].s[1] - 4.25f * tmp[i].s[3] + tmp[i].s[5];
+        com[0].s[2]         = 0.5f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 2.0f * tmp[i].s[5];
+        com[0].s[3]         = 0.25f * tmp[i].s[2] - 1.25f * tmp[i].s[4] + tmp[i].s[6];
+        com[0].s[4]         = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6];
+        com[0].s[5]         = 2.0f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 0.5f * tmp[i].s[5];
+        out[i * 8 + 0].s[0] = tmp[i].s[0] - 5.25f * tmp[i].s[2] + 5.25f * tmp[i].s[4] - tmp[i].s[6];
+        out[i * 8 + 1].s[0] = com[0].s[0] + com[0].s[1];
+        out[i * 8 + 2].s[0] = com[0].s[0] - com[0].s[1];
+        out[i * 8 + 3].s[0] = com[0].s[3] + com[0].s[2];
+        out[i * 8 + 4].s[0] = com[0].s[3] - com[0].s[2];
+        out[i * 8 + 5].s[0] = com[0].s[4] + com[0].s[5];
+        out[i * 8 + 6].s[0] = com[0].s[4] - com[0].s[5];
+        out[i * 8 + 7].s[0] = -tmp[i].s[1] + 5.25f * tmp[i].s[3] - 5.25f * tmp[i].s[5] + tmp[i].s[7];
+    })
+
+    TILE(uint, 64, 1, dst_indirect_y);
+
+    LOOP_UNROLLING(int, i, 0, 1, 64,
+    {
+        dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
+        dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 64;
+    })
+
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 7x7/7x1/1x7 and the output tile is 2x2/7x1/1x7 when the data layout is NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+//! @endcond
+__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(
+    TENSOR4D(src, BUFFER),
+    TENSOR4D(dst, BUFFER))
+{
+    const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
+    const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
+    const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+
+    // All the tensor dimensions are passed at compile time.
+    // In case of dynamic tensor support, the following dimensions should be passed as function argument.
+#define _ISRC_WIDTH SRC_WIDTH
+#define _ISRC_HEIGHT SRC_HEIGHT
+#define _INUM_TILES_X NUM_TILES_X
+#define _INUM_TILES_Y NUM_TILES_Y
+
+    int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
+    int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
+    x -= PAD_LEFT;
+    y -= PAD_TOP;
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    TILE(DATA_TYPE, 8, 1, in);
+    TILE(DATA_TYPE, 8, 1, out);
+
+    // Initialize the input tile
+    LOOP_UNROLLING(int, i, 0, 1, 8,
+    {
+        in[i].v = 0;
+    })
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+    T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#else  // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+    T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+
+    LOOP_UNROLLING(int, i, 0, 1, 8,
+    {
+        in[i].v *= (DATA_TYPE) - 36.0f;
+    })
+
+    TILE(DATA_TYPE, 1, 8, com) = { { { 0 } } };
+
+    com[0].s[0] = 36.0f * in[2].v - 13.0f * in[4].v + in[6].v;
+    com[0].s[1] = 36.0f * in[1].v - 13.0f * in[3].v + 1.0f * in[5].v;
+    com[0].s[2] = 9.0f * in[2].v - 10.0f * in[4].v + in[6].v;
+    com[0].s[3] = 18.0f * in[1].v - 20.0f * in[3].v + 2.0f * in[5].v;
+    com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v;
+    com[0].s[5] = 12.0f * in[1].v - 15.0f * in[3].v + 3.0f * in[5].v;
+    out[0].s[0] = -36.0f * in[0].v + 49.0f * in[2].v + -14.0f * in[4].v + in[6].v;
+    out[1].s[0] = com[0].s[0] - com[0].s[1];
+    out[2].s[0] = com[0].s[0] + com[0].s[1];
+    out[3].s[0] = com[0].s[2] - com[0].s[3];
+    out[4].s[0] = com[0].s[2] + com[0].s[3];
+    out[5].s[0] = com[0].s[4] - com[0].s[5];
+    out[6].s[0] = com[0].s[4] + com[0].s[5];
+    out[7].s[0] = -36.0f * in[1].v + 0.0f * in[2].v + 49.0f * in[3].v - 14.0f * in[5].v + in[7].v;
+
+    TILE(uint, 8, 1, dst_indirect_y);
+
+    LOOP_UNROLLING(int, i, 0, 1, 8,
+    {
+        dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
+        dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 8;
+    })
+
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+    TILE(DATA_TYPE, 64, 1, in);
+    TILE(DATA_TYPE, 64, 1, out);
+
+    // Initialize the input tile
+    LOOP_UNROLLING(int, i, 0, 1, 64,
+    {
+        in[i].v = 0;
+    })
+
+    // Load the tile from a NHWC tensor
+    T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+
+    TILE(DATA_TYPE, 8, 8, com);
+
+    LOOP_UNROLLING(int, i, 0, 1, 8,
+    {
+        com[0].s[i] = (DATA_TYPE)36.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
+        com[1].s[i] = (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0];
+        com[2].s[i] = (DATA_TYPE)9.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)10.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
+        com[3].s[i] = (DATA_TYPE)18.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)20.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0];
+        com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
+        com[5].s[i] = (DATA_TYPE)12.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)15.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)3.0f * in[5 * 8 + i].s[0];
+        com[6].s[i] = (DATA_TYPE)49.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[0 * 8 + i].s[0] + in[6 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[4 * 8 + i].s[0];
+        com[7].s[i] = (DATA_TYPE)49.0f * in[3 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] + in[7 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[5 * 8 + i].s[0];
+    })
+
+    TILE(DATA_TYPE, 8, 8, tmp);
+    tmp[0].v = com[6].v;
+    tmp[1].v = com[0].v - com[1].v;
+    tmp[2].v = com[0].v + com[1].v;
+    tmp[3].v = com[2].v - com[3].v;
+    tmp[4].v = com[2].v + com[3].v;
+    tmp[5].v = com[4].v - com[5].v;
+    tmp[6].v = com[4].v + com[5].v;
+    tmp[7].v = com[7].v;
+
+    LOOP_UNROLLING(int, i, 0, 1, 8,
+    {
+        com[0].s[0]         = 36.0f * tmp[i].s[2] - 13.0f * tmp[i].s[4] + tmp[i].s[6];
+        com[0].s[1]         = 36.0f * tmp[i].s[1] - 13.0f * tmp[i].s[3] + 1.0f * tmp[i].s[5];
+        com[0].s[2]         = 9.0f * tmp[i].s[2] - 10.0f * tmp[i].s[4] + tmp[i].s[6];
+        com[0].s[3]         = 18.0f * tmp[i].s[1] - 20.0f * tmp[i].s[3] + 2.0f * tmp[i].s[5];
+        com[0].s[4]         = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6];
+        com[0].s[5]         = 12.0f * tmp[i].s[1] - 15.0f * tmp[i].s[3] + 3.0f * tmp[i].s[5];
+        out[i * 8 + 0].s[0] = -36.0f * tmp[i].s[0] + 49.0f * tmp[i].s[2] + -14.0f * tmp[i].s[4] + tmp[i].s[6];
+        out[i * 8 + 1].s[0] = com[0].s[0] - com[0].s[1];
+        out[i * 8 + 2].s[0] = com[0].s[0] + com[0].s[1];
+        out[i * 8 + 3].s[0] = com[0].s[2] - com[0].s[3];
+        out[i * 8 + 4].s[0] = com[0].s[2] + com[0].s[3];
+        out[i * 8 + 5].s[0] = com[0].s[4] - com[0].s[5];
+        out[i * 8 + 6].s[0] = com[0].s[4] + com[0].s[5];
+        out[i * 8 + 7].s[0] = -36.0f * tmp[i].s[1] + 0.0f * tmp[i].s[2] + 49.0f * tmp[i].s[3] - 14.0f * tmp[i].s[5] + tmp[i].s[7];
+    })
+
+    TILE(uint, 64, 1, dst_indirect_y);
+
+    LOOP_UNROLLING(int, i, 0, 1, 64,
+    {
+        dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
+        dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 64;
+    })
+
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+//! @endcond
+__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(
+    TENSOR4D(src, BUFFER),
+    TENSOR4D(dst, BUFFER))
+{
+    winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
+                                                 src_stride_x,
+                                                 src_step_x,
+                                                 src_stride_y,
+                                                 src_step_y,
+                                                 src_stride_z,
+                                                 src_step_z,
+                                                 src_stride_w,
+                                                 src_step_w,
+                                                 src_offset_first_element_in_bytes,
+                                                 dst_ptr,
+                                                 dst_stride_x,
+                                                 dst_step_x,
+                                                 dst_stride_y,
+                                                 dst_step_y,
+                                                 dst_stride_z,
+                                                 dst_step_z,
+                                                 dst_stride_w,
+                                                 dst_step_w,
+                                                 dst_offset_first_element_in_bytes);
+}
+
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+//! @endcond
+__kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(
+    TENSOR4D(src, BUFFER),
+    TENSOR4D(dst, BUFFER))
+{
+    winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
+                                                 src_stride_x,
+                                                 src_step_x,
+                                                 src_stride_y,
+                                                 src_step_y,
+                                                 src_stride_z,
+                                                 src_step_z,
+                                                 src_stride_w,
+                                                 src_step_w,
+                                                 src_offset_first_element_in_bytes,
+                                                 dst_ptr,
+                                                 dst_stride_x,
+                                                 dst_step_x,
+                                                 dst_stride_y,
+                                                 dst_step_y,
+                                                 dst_stride_z,
+                                                 dst_step_z,
+                                                 dst_stride_w,
+                                                 dst_step_w,
+                                                 dst_offset_first_element_in_bytes);
+}
+
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+//! @endcond
+__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc(
+    TENSOR4D(src, BUFFER),
+    TENSOR4D(dst, BUFFER))
+{
+    winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
+                                                 src_stride_x,
+                                                 src_step_x,
+                                                 src_stride_y,
+                                                 src_step_y,
+                                                 src_stride_z,
+                                                 src_step_z,
+                                                 src_stride_w,
+                                                 src_step_w,
+                                                 src_offset_first_element_in_bytes,
+                                                 dst_ptr,
+                                                 dst_stride_x,
+                                                 dst_step_x,
+                                                 dst_stride_y,
+                                                 dst_step_y,
+                                                 dst_stride_z,
+                                                 dst_step_z,
+                                                 dst_stride_w,
+                                                 dst_step_w,
+                                                 dst_offset_first_element_in_bytes);
+}
+
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+//! @endcond
+__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc(
+    TENSOR4D(src, BUFFER),
+    TENSOR4D(dst, BUFFER))
+{
+    winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
+                                                 src_stride_x,
+                                                 src_step_x,
+                                                 src_stride_y,
+                                                 src_step_y,
+                                                 src_stride_z,
+                                                 src_step_z,
+                                                 src_stride_w,
+                                                 src_step_w,
+                                                 src_offset_first_element_in_bytes,
+                                                 dst_ptr,
+                                                 dst_stride_x,
+                                                 dst_step_x,
+                                                 dst_stride_y,
+                                                 dst_step_y,
+                                                 dst_stride_z,
+                                                 dst_step_z,
+                                                 dst_stride_w,
+                                                 dst_step_w,
+                                                 dst_offset_first_element_in_bytes);
+}
+
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+//! @endcond
+__kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc(
+    TENSOR4D(src, BUFFER),
+    TENSOR4D(dst, BUFFER))
+{
+    winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
+                                                 src_stride_x,
+                                                 src_step_x,
+                                                 src_stride_y,
+                                                 src_step_y,
+                                                 src_stride_z,
+                                                 src_step_z,
+                                                 src_stride_w,
+                                                 src_step_w,
+                                                 src_offset_first_element_in_bytes,
+                                                 dst_ptr,
+                                                 dst_stride_x,
+                                                 dst_step_x,
+                                                 dst_stride_y,
+                                                 dst_step_y,
+                                                 dst_stride_z,
+                                                 dst_step_z,
+                                                 dst_stride_w,
+                                                 dst_step_w,
+                                                 dst_offset_first_element_in_bytes);
+}
+
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+//! @endcond
+__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc(
+    TENSOR4D(src, BUFFER),
+    TENSOR4D(dst, BUFFER))
+{
+    winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
+                                                 src_stride_x,
+                                                 src_step_x,
+                                                 src_stride_y,
+                                                 src_step_y,
+                                                 src_stride_z,
+                                                 src_step_z,
+                                                 src_stride_w,
+                                                 src_step_w,
+                                                 src_offset_first_element_in_bytes,
+                                                 dst_ptr,
+                                                 dst_stride_x,
+                                                 dst_step_x,
+                                                 dst_stride_y,
+                                                 dst_step_y,
+                                                 dst_stride_z,
+                                                 dst_step_z,
+                                                 dst_stride_w,
+                                                 dst_step_w,
+                                                 dst_offset_first_element_in_bytes);
+}
+#endif // defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y)
+#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
diff --git a/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl
new file mode 100644
index 0000000000..0fcd04e713
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl
@@ -0,0 +1,1030 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_2x2_7x7_nhwc(
+    TENSOR4D(src, BUFFER),
+    TENSOR4D(dst, BUFFER),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+#define _ISRC_HEIGHT SRC_HEIGHT
+#define _IDST_WIDTH DST_WIDTH
+#define _IDST_HEIGHT DST_HEIGHT
+#define _INUM_TILES_X NUM_TILES_X
+
+    const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
+    const int mout = GET_SPATIAL_IDX(1, 1, 0);  // WINOGRAD OUTPUT TILES
+    const int bout = GET_SPATIAL_IDX(2, 1, 0);  // BATCH SIZE IDX
+
+    int x_out = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
+    int y_out = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    TILE(DATA_TYPE, 8, N0, in);
+    TILE(DATA_TYPE, 2, N0, out);
+    TILE(uint, 8, 1, src_indirect_y);
+
+    // Calculate the indirect Y for the source tensor
+    LOOP_UNROLLING(int, i, 0, 1, 8,
+    {
+        src_indirect_y[i].v = mout + i *_ISRC_HEIGHT;
+        src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 8);
+    })
+
+    // Initialize the input tile
+    LOOP_UNROLLING(int, i, 0, 1, 8,
+    {
+        in[i].v = 0;
+    })
+
+    // Load the values across the 8 channels to compose the 8x1 tile
+    T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+    // Compute out0 and out01
+    out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v + in[5].v + in[6].v;
+    out[1].v = -in[1].v + in[2].v - 2.f * in[3].v + 2.0f * in[4].v - 3.0f * in[5].v + 3.0f * in[6].v + in[7].v;
+
+#if defined(HAS_BIAS)
+    // Add bias
+    TILE(DATA_TYPE, 1, N0, b);
+
+    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+    T_ADD_BROADCAST_X(DATA_TYPE, 2, N0, out, b, out);
+#endif // defined(HAS_BIAS)
+
+    T_ACTIVATION(DATA_TYPE, 2, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+    TILE(uint, 2, 1, dst_indirect_y);
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    LOOP_UNROLLING(int, yk, 0, 1, 2,
+    {
+        int y_c              = min(y_out + yk, ((int)_IDST_HEIGHT - 1));
+        dst_indirect_y[yk].v = x_out + y_c * (int)(_IDST_WIDTH);
+    })
+#else  // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    LOOP_UNROLLING(int, xk, 0, 1, 2,
+    {
+        int x_c              = min(x_out + xk, ((int)_IDST_WIDTH - 1));
+        dst_indirect_y[xk].v = x_c + y_out * (int)(_IDST_WIDTH);
+    })
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+    // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 2, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+    TILE(DATA_TYPE, 64, N0, in);
+    TILE(DATA_TYPE, 4, N0, out);
+    TILE(DATA_TYPE, 16, N0, tmp);
+    TILE(uint, 64, 1, src_indirect_y);
+
+    // Calculate the indirect Y for the source tensor
+    LOOP_UNROLLING(int, i, 0, 1, 64,
+    {
+        src_indirect_y[i].v = mout + i *_ISRC_HEIGHT;
+        src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 64);
+    })
+
+    // Initialize the input tile
+    LOOP_UNROLLING(int, i, 0, 1, 64,
+    {
+        in[i].v = 0;
+    })
+
+    // Load the values across the 64 channels to compose the 8x8 tile
+    T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+    LOOP_UNROLLING(int, i, 0, 1, 8,
+    {
+        tmp[i * 2].v     = in[0 + i].v + in[8 + i].v + in[16 + i].v + in[24 + i].v + in[32 + i].v + in[40 + i].v + in[48 + i].v;
+        tmp[i * 2 + 1].v = -in[8 + i].v + in[16 + i].v - 2 * in[24 + i].v + 2 * in[32 + i].v + -3 * in[40 + i].v + 3 * in[48 + i].v + in[56 + i].v;
+    })
+
+    // Compute the 2x2 output tile
+    LOOP_UNROLLING(int, i, 0, 1, 2,
+    {
+        out[i * 2].v     = tmp[0 + i].v + tmp[2 + i].v + tmp[4 + i].v + tmp[6 + i].v + tmp[8 + i].v + tmp[10 + i].v + tmp[12 + i].v;
+        out[i * 2 + 1].v = -tmp[2 + i].v + tmp[4 + i].v - 2 * tmp[6 + i].v + 2 * tmp[8 + i].v - 3 * tmp[10 + i].v + 3 * tmp[12 + i].v + tmp[14 + i].v;
+    })
+
+#if defined(HAS_BIAS)
+    // Add bias
+    TILE(DATA_TYPE, 1, N0, b);
+
+    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+    T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
+#endif // defined(HAS_BIAS)
+
+    T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+    TILE(uint, 4, 1, dst_indirect_y);
+
+    // Calculate the destination indirect Y
+    LOOP_UNROLLING(int, yk, 0, 1, 2,
+    {
+        LOOP_UNROLLING(int, xk, 0, 1, 2,
+        {
+            int x_c                       = min(x_out + xk, ((int)_IDST_WIDTH - 1));
+            int y_c                       = min(y_out + yk, ((int)_IDST_HEIGHT - 1));
+            dst_indirect_y[xk + yk * 2].v = x_c + y_c *_IDST_WIDTH;
+            dst_indirect_y[xk + yk * 2].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+        })
+    })
+
+    // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  dst_size                          Size of the destination tensor, minus the last padding
+ */
+__kernel void winograd_output_transform_4x4_3x3_nhwc(
+    TENSOR4D(src, BUFFER),
+    TENSOR4D(dst, BUFFER),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
+    const int mout = GET_SPATIAL_IDX(1, 1, 0);  // WINOGRAD OUTPUT TILES
+    const int bout = GET_SPATIAL_IDX(2, 1, 0);  // BATCH SIZE IDX
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+    TILE(DATA_TYPE, 6, N0, in);
+    TILE(DATA_TYPE, 4, N0, out);
+    TILE(uint, 6, 1, src_indirect_y);
+
+    LOOP_UNROLLING(int, i, 0, 1, 6,
+    {
+        src_indirect_y[i].v = mout + i *SRC_HEIGHT;
+        src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 6);
+    })
+
+    // Initialize the input tile
+    LOOP_UNROLLING(int, i, 0, 1, 6,
+    {
+        in[i].v = 0;
+    })
+
+    // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
+    T_LOAD_INDIRECT(DATA_TYPE, 6, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+    // Compute out00, out01, out02 and out03
+    out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v;
+    out[1].v = in[1].v - in[2].v + 2.0f * in[3].v - 2.0f * in[4].v;
+    out[2].v = in[1].v + in[2].v + 4.0f * in[3].v + 4.0f * in[4].v;
+    out[3].v = in[1].v - in[2].v + 8.0f * in[3].v - 8.0f * in[4].v + in[5].v;
+
+#if defined(HAS_BIAS)
+    TILE(DATA_TYPE, 1, N0, b);
+
+    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+    // c = c + bias[broadcasted]
+    T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
+#endif // HAS_BIAS
+
+    int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
+    int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
+
+    T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+    TILE(uint, 4, 1, dst_indirect_y);
+
+    // Calculate the destination indirect Y
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    LOOP_UNROLLING(int, yk, 0, 1, 4,
+    {
+        int y_c              = min(y_out + yk, ((int)DST_HEIGHT - 1));
+        dst_indirect_y[yk].v = x_out + y_c *DST_WIDTH;
+        dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+    })
+#else  // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    LOOP_UNROLLING(int, xk, 0, 1, 4,
+    {
+        int x_c              = min(x_out + xk, ((int)DST_WIDTH - 1));
+        dst_indirect_y[xk].v = x_c + y_out *DST_WIDTH;
+        dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+    })
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+    // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+    // Calculate the indirect Y for the source tensor
+    TILE(DATA_TYPE, 36, N0, in);
+    TILE(DATA_TYPE, 4, N0, tmp);
+    TILE(uint, 36, 1, src_indirect_y);
+
+    LOOP_UNROLLING(int, i, 0, 1, 36,
+    {
+        src_indirect_y[i].v = mout + i *SRC_HEIGHT;
+        src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 36);
+    })
+
+    // Initialize the input tile
+    LOOP_UNROLLING(int, i, 0, 1, 36,
+    {
+        in[i].v = 0;
+    })
+
+    // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
+    T_LOAD_INDIRECT(DATA_TYPE, 36, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+    LOOP_UNROLLING(int, i, 0, 1, 6,
+    {
+        tmp[0].v     = in[6 + i].v + in[12 + i].v;
+        tmp[1].v     = in[6 + i].v - in[12 + i].v;
+        tmp[2].v     = in[18 + i].v + in[24 + i].v;
+        tmp[3].v     = in[18 + i].v - in[24 + i].v;
+        tmp[3].v     = tmp[3].v + tmp[3].v;
+        in[i].v      = in[i].v + tmp[0].v + tmp[2].v;
+        in[6 + i].v  = tmp[3].v + tmp[1].v;
+        in[12 + i].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
+        in[18 + i].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[30 + i].v;
+    })
+
+    // Compute the output tile
+    TILE(DATA_TYPE, 16, N0, out);
+
+    LOOP_UNROLLING(int, i, 0, 1, 4,
+    {
+        tmp[0].v         = in[6 * i + 1].v + in[6 * i + 2].v;
+        tmp[1].v         = in[6 * i + 1].v - in[6 * i + 2].v;
+        tmp[2].v         = in[6 * i + 3].v + in[6 * i + 4].v;
+        tmp[3].v         = in[6 * i + 3].v - in[6 * i + 4].v;
+        tmp[3].v         = tmp[3].v + tmp[3].v;
+        out[4 * i + 0].v = in[6 * i + 0].v + tmp[0].v + tmp[2].v;
+        out[4 * i + 1].v = tmp[3].v + tmp[1].v;
+        out[4 * i + 2].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
+        out[4 * i + 3].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[6 * i + 5].v;
+    })
+
+#if defined(HAS_BIAS)
+    TILE(DATA_TYPE, 1, N0, b);
+
+    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+    // c = c + bias[broadcasted]
+    T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);
+#endif // HAS_BIAS
+
+    int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
+    int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
+
+    T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+    TILE(uint, 16, 1, dst_indirect_y);
+
+    // Calculate the destination indirect Y
+    LOOP_UNROLLING(int, yk, 0, 1, 4,
+    {
+        LOOP_UNROLLING(int, xk, 0, 1, 4,
+        {
+            int x_c                       = min(x_out + xk, ((int)DST_WIDTH - 1));
+            int y_c                       = min(y_out + yk, ((int)DST_HEIGHT - 1));
+            dst_indirect_y[xk + yk * 4].v = x_c + y_c *DST_WIDTH;
+            dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+        })
+    })
+
+    // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x4_5x5_nhwc(
+    TENSOR4D(src, BUFFER),
+    TENSOR4D(dst, BUFFER),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
+    const int mout = GET_SPATIAL_IDX(1, 1, 0);  // WINOGRAD OUTPUT TILES
+    const int bout = GET_SPATIAL_IDX(2, 1, 0);  // BATCH SIZE IDX
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    TILE(DATA_TYPE, 8, N0, in);
+    TILE(DATA_TYPE, 4, N0, out);
+    TILE(DATA_TYPE, 4, N0, tmp);
+    TILE(uint, 8, 1, src_indirect_y);
+
+    LOOP_UNROLLING(int, i, 0, 1, 8,
+    {
+        src_indirect_y[i].v = mout + i *SRC_HEIGHT;
+        src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 8);
+    })
+
+    // Initialize the input tile
+    LOOP_UNROLLING(int, i, 0, 1, 8,
+    {
+        in[i].v = 0;
+    })
+
+    // "in" contains 1x8 or 8x1 tile here
+    T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+    // A^T * in, and in this degenerate case out consists of 1 column/row
+    tmp[0].v = in[1].v - in[2].v;
+    tmp[1].v = 2.0f * (in[3].v - in[4].v);
+    tmp[2].v = 2.0f * (in[5].v + in[6].v);
+    tmp[3].v = in[3].v + in[4].v;
+    out[0].v = in[0].v + in[1].v + in[2].v + tmp[3].v + 4.0f * tmp[2].v;
+    out[1].v = tmp[0].v + tmp[1].v + 4.0f * (in[5].v - in[6].v);
+    out[2].v = in[1].v + in[2].v + 4.0f * tmp[3].v + tmp[2].v;
+    out[3].v = tmp[0].v + 4.0f * tmp[1].v + in[5].v - in[6].v + in[7].v;
+
+#if defined(HAS_BIAS)
+    TILE(DATA_TYPE, 1, N0, b);
+
+    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+    // c = c + bias[broadcasted]
+    T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
+#endif // HAS_BIAS
+
+    int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
+    int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
+
+    T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+    TILE(uint, 4, 1, dst_indirect_y);
+
+    // Calculate the destination indirect Y
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    LOOP_UNROLLING(int, yk, 0, 1, 4,
+    {
+        int y_c              = min(y_out + yk, ((int)DST_HEIGHT - 1));
+        dst_indirect_y[yk].v = x_out + y_c *DST_WIDTH;
+        dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+    })
+#else  // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    LOOP_UNROLLING(int, xk, 0, 1, 4,
+    {
+        int x_c              = min(x_out + xk, ((int)DST_WIDTH - 1));
+        dst_indirect_y[xk].v = x_c + y_out *DST_WIDTH;
+        dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+    })
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+    // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    // Calculate the indirect Y for the source tensor
+    TILE(DATA_TYPE, 64, N0, in);
+    TILE(DATA_TYPE, 6, N0, tmp);
+    TILE(uint, 64, 1, src_indirect_y);
+
+    LOOP_UNROLLING(int, i, 0, 1, 64,
+    {
+        src_indirect_y[i].v = mout + i *SRC_HEIGHT;
+        src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 64);
+    })
+
+    // Initialize the input tile
+    LOOP_UNROLLING(int, i, 0, 1, 64,
+    {
+        in[i].v = 0;
+    })
+
+    // "in" here is 8x8 tile
+    T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+    // A^T * in
+    LOOP_UNROLLING(int, i, 0, 1, 8,
+    {
+        tmp[0].v = in[8 + i].v + in[16 + i].v;
+        tmp[1].v = in[8 + i].v - in[16 + i].v;
+        tmp[2].v = in[24 + i].v + in[32 + i].v;
+        tmp[3].v = in[24 + i].v - in[32 + i].v;
+        tmp[3].v = tmp[3].v + tmp[3].v;
+        tmp[4].v = in[40 + i].v + in[48 + i].v;
+        tmp[4].v = tmp[4].v + tmp[4].v;
+        tmp[5].v = in[40 + i].v - in[48 + i].v;
+
+        // 4x8 matrix as a result
+        in[i].v      = in[i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);
+        in[8 + i].v  = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);
+        in[16 + i].v = tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[4].v);
+        in[24 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[5].v) + in[56 + i].v;
+    })
+
+    // Compute the output tile
+    TILE(DATA_TYPE, 16, N0, out);
+
+    // in * A, with in = A^T * in as above
+    LOOP_UNROLLING(int, i, 0, 1, 4,
+    {
+        tmp[0].v = in[8 * i + 1].v + in[8 * i + 2].v;
+        tmp[1].v = in[8 * i + 1].v - in[8 * i + 2].v;
+        tmp[2].v = in[8 * i + 3].v + in[8 * i + 4].v;
+        tmp[3].v = in[8 * i + 3].v - in[8 * i + 4].v;
+        tmp[3].v = tmp[3].v + tmp[3].v;
+        tmp[4].v = in[8 * i + 5].v + in[8 * i + 6].v;
+        tmp[4].v = tmp[4].v + tmp[4].v;
+        tmp[5].v = in[8 * i + 5].v - in[8 * i + 6].v;
+
+        // 4x4 tile
+        out[4 * i].v     = in[8 * i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);
+        out[4 * i + 1].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);
+        out[4 * i + 2].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[0].v) + tmp[4].v;
+        out[4 * i + 3].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[1].v) + tmp[5].v + in[8 * i + 7].v;
+    })
+
+#if defined(HAS_BIAS)
+    TILE(DATA_TYPE, 1, N0, b);
+
+    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+    // c = c + bias[broadcasted]
+    T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);
+#endif // HAS_BIAS
+
+    int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
+    int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
+
+    T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+    TILE(uint, 16, 1, dst_indirect_y);
+
+    // Calculate the destination indirect Y
+    LOOP_UNROLLING(int, yk, 0, 1, 4,
+    {
+        LOOP_UNROLLING(int, xk, 0, 1, 4,
+        {
+            int x_c                       = min(x_out + xk, ((int)DST_WIDTH - 1));
+            int y_c                       = min(y_out + yk, ((int)DST_HEIGHT - 1));
+            dst_indirect_y[xk + yk * 4].v = x_c + y_c *DST_WIDTH;
+            dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+        })
+    })
+
+    // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_2x1_7x1_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    winograd_output_transform_2x2_7x7_nhwc(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
+                                           dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+                                           bias_ptr,
+                                           bias_stride_x,
+                                           bias_step_x,
+                                           bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+                                           dst_size);
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x1_3x1_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    winograd_output_transform_4x4_3x3_nhwc(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
+                                           dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+                                           bias_ptr,
+                                           bias_stride_x,
+                                           bias_step_x,
+                                           bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+                                           dst_size);
+}
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x1_5x1_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    winograd_output_transform_4x4_5x5_nhwc(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
+                                           dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+                                           bias_ptr,
+                                           bias_stride_x,
+                                           bias_step_x,
+                                           bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+                                           dst_size);
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_1x2_1x7_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    winograd_output_transform_2x2_7x7_nhwc(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
+                                           dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+                                           bias_ptr,
+                                           bias_stride_x,
+                                           bias_step_x,
+                                           bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+                                           dst_size);
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_1x4_1x3_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    winograd_output_transform_4x4_3x3_nhwc(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
+                                           dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+                                           bias_ptr,
+                                           bias_stride_x,
+                                           bias_step_x,
+                                           bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+                                           dst_size);
+}
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_1x4_1x5_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    winograd_output_transform_4x4_5x5_nhwc(src_ptr,
+                                           src_stride_x,
+                                           src_step_x,
+                                           src_stride_y,
+                                           src_step_y,
+                                           src_stride_z,
+                                           src_step_z,
+                                           src_stride_w,
+                                           src_step_w,
+                                           src_offset_first_element_in_bytes,
+                                           dst_ptr,
+                                           dst_stride_x,
+                                           dst_step_x,
+                                           dst_stride_y,
+                                           dst_step_y,
+                                           dst_stride_z,
+                                           dst_step_z,
+                                           dst_stride_w,
+                                           dst_step_w,
+                                           dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+                                           bias_ptr,
+                                           bias_stride_x,
+                                           bias_step_x,
+                                           bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+                                           dst_size);
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
+\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
deleted file mode 100644
index d63a2e51e8..0000000000
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ /dev/null
@@ -1,981 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "repeat.h"
-#include "tile_helpers.h"
-
-#if defined(POOL_AVG) || defined(POOL_L2)
-#define POOL_OP(x, y) ((x) + (y))
-#else /* defined(POOL_AVG) || defined(POOL_L2) */
-#define POOL_OP(x, y) (fmax((x), (y)))
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
-#define POW2_OP(x, vec_size) ((x) * (x))
-#else /* defined(POOL_L2) */
-#define POW2_OP(x, vec_size) (x)
-#endif /* defined(POOL_L2) */
-
-#define DIV_OP(x, y) (x * (1.f / y))
-#define SQRT_OP(x) sqrt((x))
-
-#if STRIDE_X == 1
-#define POOLING3x3(res, input, output) POOLING3x3_STRIDE1(res, input, output)
-#elif STRIDE_X == 2 /* STRIDE_X == 1 */
-#define POOLING3x3(res, input, output) POOLING3x3_STRIDE2(res, input, output)
-#elif STRIDE_X == 3 /* STRIDE_X not equals 1 or 2 */
-#define POOLING3x3(res, input, output) POOLING3x3_STRIDE3(res, input, output)
-#endif /* STRIDE_X == 3 */
-
-#if defined(FP_MIXED_PRECISION)
-#define CONVERT_TO_ACC_DATA_TYPE(x, n) CONVERT(x, VEC_DATA_TYPE(ACC_DATA_TYPE, n))
-#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) \
-    CONVERT_TO_ACC_DATA_TYPE(vload##n(offset, ptr), n)
-#else /* defined(FP_MIXED_PRECISION) */
-#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) vload##n(offset, ptr)
-#endif /* defined(FP_MIXED_PRECISION) */
-
-#define POOLING3x3_STRIDE1(res, input, output)                                                                                                       \
-    ({                                                                                                                                               \
-        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
-        data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));                                   \
-        VEC_DATA_TYPE(ACC_DATA_TYPE, 2)                                                                                                              \
-        data01 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 4);                               \
-        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
-        data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));                                   \
-        VEC_DATA_TYPE(ACC_DATA_TYPE, 2)                                                                                                              \
-        data11 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 4);                               \
-        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
-        data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));                                   \
-        VEC_DATA_TYPE(ACC_DATA_TYPE, 2)                                                                                                              \
-        data21 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 4);                               \
-        data00 = POW2_OP(data00, 4);                                                                                                                 \
-        data01 = POW2_OP(data01, 2);                                                                                                                 \
-        data10 = POW2_OP(data10, 4);                                                                                                                 \
-        data11 = POW2_OP(data11, 2);                                                                                                                 \
-        data20 = POW2_OP(data20, 4);                                                                                                                 \
-        data21 = POW2_OP(data21, 2);                                                                                                                 \
-        \
-        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
-        values00 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data00.s01212323);                                                                              \
-        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
-        values01 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data01.s0, data00.s3, data01.s01);                                                              \
-        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
-        values10 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data10.s01212323);                                                                              \
-        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
-        values11 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data11.s0, data10.s3, data11.s01);                                                              \
-        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
-        values20 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data20.s01212323);                                                                              \
-        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
-        values21 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data21.s0, data20.s3, data21.s01);                                                              \
-        \
-        values00 = POOL_OP(values00, values10);                                                                                                      \
-        values01 = POOL_OP(values01, values11);                                                                                                      \
-        values00 = POOL_OP(values00, values20);                                                                                                      \
-        values01 = POOL_OP(values01, values21);                                                                                                      \
-        \
-        res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s147, values01.s2)); \
-        res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s25, values01.s03));                                                           \
-    })
-
-#define POOLING3x3_STRIDE2(res, input, output)                                                                                                       \
-    ({                                                                                                                                               \
-        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
-        data00               = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));                     \
-        ACC_DATA_TYPE data01 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8));                                       \
-        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
-        data10               = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));                     \
-        ACC_DATA_TYPE data11 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8));                                       \
-        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
-        data20               = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));                     \
-        ACC_DATA_TYPE data21 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8));                                       \
-        data00               = POW2_OP(data00, 8);                                                                                                   \
-        data01               = POW2_OP(data01, 1);                                                                                                   \
-        data10               = POW2_OP(data10, 8);                                                                                                   \
-        data11               = POW2_OP(data11, 1);                                                                                                   \
-        data20               = POW2_OP(data20, 8);                                                                                                   \
-        data21               = POW2_OP(data21, 1);                                                                                                   \
-        \
-        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
-        values00 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data00.s01223445);                                                                              \
-        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
-        values01 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s667, data01);                                                                           \
-        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
-        values10 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data10.s01223445);                                                                              \
-        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
-        values11 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data10.s667, data11);                                                                           \
-        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                              \
-        values20 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data20.s01223445);                                                                              \
-        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                              \
-        values21 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data20.s667, data21);                                                                           \
-        \
-        values00 = POOL_OP(values00, values10);                                                                                                      \
-        values01 = POOL_OP(values01, values11);                                                                                                      \
-        values00 = POOL_OP(values00, values20);                                                                                                      \
-        values01 = POOL_OP(values01, values21);                                                                                                      \
-        \
-        res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s147, values01.s2)); \
-        res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s25, values01.s03));                                                           \
-    })
-
-#define POOLING3x3_STRIDE3(res, input, output)                                                                                               \
-    ({                                                                                                                                       \
-        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                      \
-        data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));                           \
-        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                      \
-        data01 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8);                       \
-        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                      \
-        data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));                           \
-        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                      \
-        data11 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8);                       \
-        VEC_DATA_TYPE(ACC_DATA_TYPE, 8)                                                                                                      \
-        data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));                           \
-        VEC_DATA_TYPE(ACC_DATA_TYPE, 4)                                                                                                      \
-        data21 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8);                       \
-        data00 = POW2_OP(data00, 8);                                                                                                         \
-        data01 = POW2_OP(data01, 4);                                                                                                         \
-        data10 = POW2_OP(data10, 8);                                                                                                         \
-        data11 = POW2_OP(data11, 4);                                                                                                         \
-        data20 = POW2_OP(data20, 8);                                                                                                         \
-        data21 = POW2_OP(data21, 4);                                                                                                         \
-        \
-        data00 = POOL_OP(data00, data10);                                                                                                    \
-        data01 = POOL_OP(data01, data11);                                                                                                    \
-        data00 = POOL_OP(data00, data20);                                                                                                    \
-        data01 = POOL_OP(data01, data21);                                                                                                    \
-        \
-        res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s036, data01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s147, data01.s2)); \
-        res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s25, data01.s03));                                                       \
-    })
-
-ACC_DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
-                                  const int pad_x, const int pad_y, const int stride_x, const int stride_y)
-{
-    int       start_x = get_global_id(0) * stride_x - pad_x;
-    int       start_y = get_global_id(1) * stride_y - pad_y;
-    const int end_x   = min(start_x + pool_size_x, upper_bound_w);
-    const int end_y   = min(start_y + pool_size_y, upper_bound_h);
-#if defined(EXCLUDE_PADDING)
-    start_x = max(0, start_x);
-    start_y = max(0, start_y);
-#endif /* defined(EXCLUDE_PADDING) */
-    return ((end_y - start_y) * (end_x - start_x));
-}
-
-/** Performs a pooling function of pool size equal to 2.
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
- * @note In case of average pooling the following information must be passed at compile time:
- *       -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
- *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void pooling_layer_2(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output))
-{
-    // Get pixels pointer
-    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-    // Load data
-    VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
-    data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
-    VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
-    data1 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
-
-#if defined(POOL_L2)
-    // Raise to power of 2 for L2 Pooling
-    data0 = POW2_OP(data0, 2);
-    data1 = POW2_OP(data1, 2);
-#endif /* defined(POOL_L2) */
-
-    // Perform calculations
-    data0             = POOL_OP(data0, data1);
-    ACC_DATA_TYPE res = POOL_OP(data0.s0, data0.s1);
-
-#if defined(POOL_AVG) || defined(POOL_L2)
-    // Divide by pool region in case of average or l2 pooling
-    res = DIV_OP(res, calculate_avg_scale(2, 2, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
-    // Take square root of the result in L2 pooling
-    res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
-
-    // Store result
-    *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
-}
-
-/** Performs a pooling function of pool size equal to 3
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
- * @note In case of average pooling the following information must be passed at compile time:
- *       -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
- *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void pooling_layer_3(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output))
-{
-    // Get pixels pointer
-    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-    // Load data
-    VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
-    data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
-    VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
-    data1 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
-    VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
-    data2 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
-
-#if defined(POOL_L2)
-    // Raise to power of 2 for L2 Pooling
-    data0 = POW2_OP(data0, 3);
-    data1 = POW2_OP(data1, 3);
-    data2 = POW2_OP(data2, 3);
-#endif /* defined(POOL_L2) */
-
-    // Perform calculations
-    data0             = POOL_OP(data0, data1);
-    data0             = POOL_OP(data0, data2);
-    ACC_DATA_TYPE res = POOL_OP(POOL_OP(data0.s0, data0.s1), data0.s2);
-
-#if defined(POOL_AVG) || defined(POOL_L2)
-    // Divide by pool region in case of average pooling
-    res = DIV_OP(res, calculate_avg_scale(3, 3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
-    // Take square root of the result in L2 pooling
-    res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
-
-    // Store result
-    *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
-}
-
-#if defined(POOLING3x3)
-
-#define CONVERT_OP(data_type) convert_##data_type##4
-#define CONVERT_VECTOR4(data_type) CONVERT_OP(data_type)
-
-VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
-calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upper_bound_h,
-                     const int pad_x, const int pad_y, const int stride_x, const int stride_y)
-{
-    int4       start_x = ((int4)get_global_id(0) * 4 + (int4)(0, 1, 2, 3)) * (int4)stride_x - (int4)pad_x;
-    int        start_y = get_global_id(1) * stride_y - pad_y;
-    const int4 end_x   = min(start_x + (int4)pool_size, (int4)upper_bound_w);
-    const int  end_y   = min(start_y + pool_size, upper_bound_h);
-#if defined(EXCLUDE_PADDING)
-    start_x = max((int4)0, start_x);
-    start_y = max(0, start_y);
-#endif /* defined(EXCLUDE_PADDING) */
-    return (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(1.f) / CONVERT_VECTOR4(ACC_DATA_TYPE)(((int4)(end_y - start_y)) * (end_x - start_x));
-}
-
-/** Performs an optimized pooling function of pool size equal to 3 when the stride_x is less equal than 3
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
- * @note In case of average pooling the following information must be passed at compile time:
- *       -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
- *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void pooling_layer_optimized_3(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output))
-{
-    // Get pixels pointer
-    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-    VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
-    res;
-
-    // Perform pooling 3x3 for 4 output elements
-    POOLING3x3(res, input, output);
-
-#if defined(POOL_AVG) || defined(POOL_L2)
-    // Divide by pool region in case of average pooling
-    res *= calculate_avg_scale4(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
-    // Take square root of the result in L2 pooling
-    res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
-
-    vstore4(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 4)), 0, (__global DATA_TYPE *)output.ptr);
-}
-#endif // defined(POOLING3x3)
-
-#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
-
-/** Performs a pooling function of pool size equal to N  (NCHW)
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
- * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
- * @note In case of average pooling the following information must be passed at compile time:
- *       -DPOOL_AVG must be provided otherwise max pooling will be performed.
- *       -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- *       -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- *       -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
- * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
- *
- * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
- * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void pooling_layer_MxN_nchw(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output))
-{
-    // Get pixels pointer
-    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-    VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
-    vdata               = INITIAL_VALUE;
-    ACC_DATA_TYPE sdata = INITIAL_VALUE;
-
-    // Load data
-    for(int y = 0; y < POOL_SIZE_Y; y++)
-    {
-        int x = 0;
-        for(; x <= ((int)POOL_SIZE_X - 8); x += 8)
-        {
-            VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
-            data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
-#if defined(POOL_L2)
-            // Raise to power of 2 for L2 Pooling
-            data0 *= data0;
-#endif /* defined(POOL_L2) */
-            vdata = POOL_OP(vdata, data0);
-        }
-
-        // Leftover
-        for(; x < (int)POOL_SIZE_X; ++x)
-        {
-            ACC_DATA_TYPE data0 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)));
-#if defined(POOL_L2)
-            // Raise to power of 2 for L2 Pooling
-            data0 *= data0;
-#endif /* defined(POOL_L2) */
-            sdata = POOL_OP(sdata, data0);
-        }
-    }
-
-    // Reduce result
-    VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
-    reduce4 = POOL_OP(vdata.s0123, vdata.s4567);
-    VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
-    reduce2           = POOL_OP(reduce4.s01, reduce4.s23);
-    ACC_DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);
-    res               = POOL_OP(res, sdata);
-
-#if defined(POOL_AVG) || defined(POOL_L2)
-    // Divide by pool region in case of average pooling
-    res = DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
-    // Take square root of the result in L2 pooling
-    res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
-
-    // Store result
-    *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
-}
-#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
-
-#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
-
-inline void offset_no_padding_nchw(const Tensor3D *input, uint *offset_top, uint *offset_bottom)
-{
-    const int pad_horiz = PAD_TENSOR_LEFT + PAD_TENSOR_RIGHT;
-    const int pad_vert  = PAD_TENSOR_TOP + PAD_TENSOR_BOTTOM;
-
-    const int x = get_global_id(0) * STRIDE_X;
-    const int y = get_global_id(1) * STRIDE_Y;
-    const int z = get_global_id(2);
-
-    //x axis: width, y axis: height, z axis: component
-    const uint padded_offset = input->offset_first_element_in_bytes
-                               + x * input->stride_x
-                               + y * input->stride_y
-                               + z * input->stride_z;
-
-    const uint offset_base = padded_offset
-                             - y * pad_horiz * sizeof(DATA_TYPE)                                               /* Horizontal padding for each row */
-                             - PAD_TENSOR_TOP * input->stride_y                                                /* top padding */
-                             - z * MAX_HEIGHT * pad_horiz * sizeof(DATA_TYPE) - z * pad_vert * input->stride_y /* Z plane padding */
-                             - PAD_TENSOR_LEFT * sizeof(DATA_TYPE);
-
-#if defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT)
-    *offset_top = (uint)((offset_base / sizeof(DATA_TYPE)) % (TENSOR_CHANNEL * TENSOR_WIDTH * TENSOR_HEIGHT));
-#else  /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */
-    *offset_top   = (uint)(offset_base / sizeof(DATA_TYPE));
-#endif /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */
-
-    *offset_bottom = *offset_top + input->stride_y / sizeof(DATA_TYPE) - pad_horiz;
-
-    return;
-}
-
-#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
-
-/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW.
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32
- * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
- * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
- * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM
- *
- * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: F32
- * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
- * @param[in]  indices_ptr                           Pointer to the indices tensor. Supported data types: U32
- * @param[in]  indices_stride_x                      Stride of the indices tensor in X dimension (in bytes)
- * @param[in]  indices_step_x                        indices_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  indices_stride_y                      Stride of the indices tensor in Y dimension (in bytes)
- * @param[in]  indices_step_y                        indices_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  indices_stride_z                      Stride of the indices tensor in Z dimension (in bytes)
- * @param[in]  indices_step_z                        indices_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
- */
-__kernel void pooling_layer_2_nchw_indices_fp32(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output),
-    TENSOR3D_DECLARATION(indices))
-{
-    // Get pixels pointer
-    Tensor3D input   = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D output  = CONVERT_TO_TENSOR3D_STRUCT(output);
-    Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices);
-
-    // Load data
-    float2 data0 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 0, 0));
-    float2 data1 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
-
-    // Perform calculations
-    float data0_max = POOL_OP(data0.s0, data0.s1);
-    float data1_max = POOL_OP(data1.s0, data1.s1);
-    float res       = POOL_OP(data0_max, data1_max);
-    // Store result
-    *(__global float *)output.ptr = res;
-
-#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
-
-    uint offset_top    = 0;
-    uint offset_bottom = 0;
-
-    offset_no_padding_nchw(&input, &offset_top, &offset_bottom);
-
-    uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1));
-    uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1));
-    uint index  = select(index1, index0, isgreaterequal(data0_max, data1_max));
-
-    *(__global uint *)indices.ptr = index;
-
-#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
-}
-
-/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW.
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F16
- * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
- * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
- * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM
- *
- * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: F16
- * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
- * @param[in]  indices_ptr                           Pointer to the indices tensor. Supported data types: U32
- * @param[in]  indices_stride_x                      Stride of the indices tensor in X dimension (in bytes)
- * @param[in]  indices_step_x                        indices_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  indices_stride_y                      Stride of the indices tensor in Y dimension (in bytes)
- * @param[in]  indices_step_y                        indices_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  indices_stride_z                      Stride of the indices tensor in Z dimension (in bytes)
- * @param[in]  indices_step_z                        indices_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
- */
-__kernel void pooling_layer_2_nchw_indices_fp16(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output),
-    TENSOR3D_DECLARATION(indices))
-{
-    // Get pixels pointer
-    Tensor3D input   = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D output  = CONVERT_TO_TENSOR3D_STRUCT(output);
-    Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices);
-
-    // Load data
-    half2 data0 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 0, 0));
-    half2 data1 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 1, 0));
-
-    // Perform calculations
-    half data0_max = POOL_OP(data0.s0, data0.s1);
-    half data1_max = POOL_OP(data1.s0, data1.s1);
-    half res       = POOL_OP(data0_max, data1_max);
-    // Store result
-    *(__global half *)output.ptr = res;
-
-#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
-
-    uint offset_top    = 0;
-    uint offset_bottom = 0;
-
-    offset_no_padding_nchw(&input, &offset_top, &offset_bottom);
-
-    uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1));
-    uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1));
-    uint index  = select(index1, index0, isgreaterequal(data0_max, data1_max));
-
-    *(__global uint *)indices.ptr = index;
-
-#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
-}
-
-#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
-
-#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
-/** Performs pooling layer of size equal to MxN. This OpenCL kernel can perform the following pooling types:
- * -# max, -DPOOL_MAX must be passed at compile time
- * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be expluded, -DEXCLUDE_PADDING should be passed at compile time
- * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
- *
- * @note Datatype must be passed at compile type using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
- * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
- * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
- * @note Pool size must be passed at compile time using -DPOOL_SIZE_X and -DPOOL_SIZE_Y. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4
- * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
- * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
- * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
- * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
- * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
- *
- * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
- * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void pooling_layer_MxN_nhwc(
-    TENSOR4D_DECLARATION(input),
-    TENSOR4D_DECLARATION(output))
-{
-    // Note: If C is not multiple of VEC_SIZE, we shift back of VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0
-    // Note: If C is less than VEC_SIZE, VEC_SIZE should be SHRINKED to the closest smaller VEC_SIZE. This operation is performed on the host side
-    int idx_out_c = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
-    int idx_out_w = GET_SPATIAL_IDX(1, 1, 0);
-#if DST_BATCH_SIZE != 1
-    // If batch size != 1, the batch size dimension is collapsed over the height dimension
-    int idx_out_h = GET_SPATIAL_IDX(2, 1, 0) % DST_HEIGHT;
-    int idx_out_n = GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT;
-#else  //DST_BATCH_SIZE != 1
-    int idx_out_h   = GET_SPATIAL_IDX(2, 1, 0);
-    int idx_out_n   = 0;
-#endif // DST_BATCH_SIZE != 1
-
-    __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w;
-
-    __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n *
-                                           output_stride_w;
-
-    VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
-    res0 = INITIAL_VALUE;
-
-    int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
-    int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
-
-    int pool_x_s = max((int)0, -idx_in_w);
-    int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
-    int pool_y_s = max((int)0, -idx_in_h);
-    int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
-
-#if defined(EXCLUDE_PADDING)
-    int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
-#else  // defined(EXCLUDE_PADDING)
-    int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
-#endif // defined(EXCLUDE_PADDING)
-
-#if POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
-    // Global pooling path
-    for(int y = 0; y < POOL_SIZE_Y; ++y)
-    {
-#pragma unroll 8
-        for(int x = 0; x < POOL_SIZE_X; ++x)
-        {
-#else // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
-    for(int y = pool_y_s; y < pool_y_e; ++y)
-    {
-#pragma unroll 8
-        for(int x = pool_x_s; x < pool_x_e; ++x)
-        {
-#endif // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
-            VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
-            data0;
-#if defined(FP_MIXED_PRECISION)
-            // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
-            data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
-#else  // defined(FP_MIXED_PRECISION)
-            data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z));
-#endif // defined(FP_MIXED_PRECISION)
-
-#if defined(POOL_L2)
-            // Raise to power of 2 for L2 Pooling
-            data0 *= data0;
-#endif // defined(POOL_L2)
-            res0 = POOL_OP(res0, data0);
-        }
-    }
-
-#if defined(POOL_AVG) || defined(POOL_L2)
-    res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
-#endif // defined(POOL_AVG) || defined(POOL_L2)
-
-#if defined(POOL_L2)
-    // Take square root of the result in L2 pooling
-    res0 = SQRT_OP(res0);
-#endif // defined(POOL_L2)
-
-    // Store result
-#if defined(FP_MIXED_PRECISION)
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
-    STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
-#else  // defined(FP_MIXED_PRECISION)
-    STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
-#endif // defined(FP_MIXED_PRECISION)
-}
-#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
-
-#define SELECT_TYPE SELECT_VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
-
-/** Performs pooling layer of size equal to 2. This OpenCL kernel can perform the following pooling types:
- * -# max, -DPOOL_MAX must be passed at compile time
- * -# max extracting the max index, -DPOOL_MAX and -DEXTRACT_MAX_INDEX must be passed at compile time
- * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be expluded, -DEXCLUDE_PADDING should be passed at compile time
- * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
- *
- * @note Datatype must be passed at compile type using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
- * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
- * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
- * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
- * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
- * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
- * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
- * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
- *
- * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_stride_w                        Stride of the source tensor in W dimension (in bytes)
- * @param[in]  input_step_w                          input_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                       Stride of the destination tensor in W dimension (in bytes)
- * @param[in]  output_step_w                         output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
- * @param[in]  indices_ptr                           (Optional) Pointer to the indices tensor. Supported data types: U32
- * @param[in]  indices_stride_x                      (Optional) Stride of the indices tensor in X dimension (in bytes)
- * @param[in]  indices_step_x                        (Optional) indices_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  indices_stride_y                      (Optional) Stride of the indices tensor in Y dimension (in bytes)
- * @param[in]  indices_step_y                        (Optional) indices_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  indices_stride_z                      (Optional) Stride of the indices tensor in Z dimension (in bytes)
- * @param[in]  indices_step_z                        (Optional) indices_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  indices_stride_w                      (Optional) Stride of the indices tensor in W dimension (in bytes)
- * @param[in]  indices_step_w                        (Optional) indices_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  indices_offset_first_element_in_bytes (Optional) The offset of the first element in the indices tensor
- */
-__kernel void pooling_layer_2x2_nhwc(
-    TENSOR4D_DECLARATION(input),
-    TENSOR4D_DECLARATION(output)
-#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
-    ,
-    TENSOR4D_DECLARATION(indices)
-#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
-)
-{
-    // Note: If C is not multiple of VEC_SIZE, we shift back of VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0
-    // Note: If C is less than VEC_SIZE, VEC_SIZE should be SHRINKED to the closest smaller VEC_SIZE. This operation is performed on the host side
-    int idx_out_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
-    int idx_out_w = get_global_id(1);
-#if DST_BATCH_SIZE != 1
-    // If batch size != 1, the batch size dimension is collapsed over the height dimension
-    int idx_out_h = get_global_id(2) % DST_HEIGHT;
-    int idx_out_n = get_global_id(2) / DST_HEIGHT;
-#else  //SRC_BATCH_SIZE != 1
-    int idx_out_h = get_global_id(2);
-    int idx_out_n = 0;
-#endif // SRC_BATCH_SIZE != 1
-
-    int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
-    int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
-
-    __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w;
-
-    __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n *
-                                           output_stride_w;
-
-    int pool_x_s = max((int)0, -idx_in_w);
-    int pool_x_e = min((int)2, (int)SRC_WIDTH - idx_in_w);
-    int pool_y_s = max((int)0, -idx_in_h);
-    int pool_y_e = min((int)2, (int)SRC_HEIGHT - idx_in_h);
-
-    int filter_size = (pool_x_e - pool_x_s) * (pool_y_e - pool_y_s);
-
-    int x0 = pool_x_s + idx_in_w;
-    int y0 = pool_y_s + idx_in_h;
-    int x1 = pool_x_e - 1 + idx_in_w;
-    int y1 = pool_y_e - 1 + idx_in_h;
-
-    REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE), data, 0);
-
-#if defined(FP_MIXED_PRECISION)
-    // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
-    data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
-    data1 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
-    data2 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
-    data3 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
-#else  // defined(FP_MIXED_PRECISION)
-    data0         = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z));
-    data1         = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z));
-    data2         = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z));
-    data3         = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z));
-#endif // defined(FP_MIXED_PRECISION)
-
-#if !defined(POOL_MAX)
-    if(filter_size != 4)
-    {
-        SELECT_TYPE cond_w_s = (SELECT_TYPE)idx_in_w < (SELECT_TYPE)0;
-        SELECT_TYPE cond_w_e = (SELECT_TYPE)idx_in_w >= (SELECT_TYPE)(SRC_WIDTH - 1);
-        SELECT_TYPE cond_h_s = (SELECT_TYPE)idx_in_h < (SELECT_TYPE)0;
-        SELECT_TYPE cond_h_e = (SELECT_TYPE)idx_in_h >= (SELECT_TYPE)(SRC_HEIGHT - 1);
-
-        // Make invalid the values loaded if the x or y coordinate was clamped (out-of-bound)
-        data0 = select(data0, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_s));
-        data1 = select(data1, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_s));
-        data2 = select(data2, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_e));
-        data3 = select(data3, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_e));
-    }
-#endif // !defined(POOL_MAX)
-
-#if defined(POOL_L2)
-    // Raise to power of 2 for L2 Pooling
-    data0 *= data0;
-    data1 *= data1;
-    data2 *= data2;
-    data3 *= data3;
-#endif /* defined(POOL_L2) */
-
-    VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
-    res0 = data0;
-    res0 = POOL_OP(res0, data1);
-    res0 = POOL_OP(res0, data2);
-    res0 = POOL_OP(res0, data3);
-
-#if defined(POOL_AVG) || defined(POOL_L2)
-#if defined(EXCLUDE_PADDING)
-    res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
-#else  // !defined(EXCLUDE_PADDING)
-    res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))4;
-#endif // defined(EXCLUDE_PADDING)
-#endif // defined(POOL_AVG) || defined(POOL_L2)
-
-#if defined(POOL_L2)
-    // Take square root of the result in L2 pooling
-    res0 = SQRT_OP(res0);
-#endif // defined(POOL_L2)
-
-    // Store result
-#if defined(FP_MIXED_PRECISION)
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
-    STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
-#else  // defined(FP_MIXED_PRECISION)
-    STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
-#endif // defined(FP_MIXED_PRECISION)
-
-#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
-
-    // This part is used to return the index of the maximum value
-    // Note: DST_CHANNELS and DST_BATCH_SIZE can be used for either the input and output tensor
-
-    // note: Batch dimension does not contribute in the offset contribution
-    VEC_DATA_TYPE(uint, VEC_SIZE)
-    base_index = (uint)idx_out_c;
-
-    base_index += VEC_OFFS(uint, VEC_SIZE);
-
-    VEC_DATA_TYPE(uint, VEC_SIZE)
-    index0 = base_index + (uint)x0 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
-    VEC_DATA_TYPE(uint, VEC_SIZE)
-    index1 = base_index + (uint)x1 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
-    VEC_DATA_TYPE(uint, VEC_SIZE)
-    index2 = base_index + (uint)x0 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
-    VEC_DATA_TYPE(uint, VEC_SIZE)
-    index3 = base_index + (uint)x1 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
-
-    index0 = select(index1, index0, CONVERT(isgreaterequal(data0, data1), VEC_DATA_TYPE(int, VEC_SIZE)));
-    index1 = select(index3, index2, CONVERT(isgreaterequal(data2, data3), VEC_DATA_TYPE(int, VEC_SIZE)));
-    index0 = select(index1, index0, CONVERT(isgreaterequal(max(data0, data1), max(data2, data3)), VEC_DATA_TYPE(int, VEC_SIZE)));
-
-    __global unsigned char *idx_base_ptr = indices_ptr + indices_offset_first_element_in_bytes + idx_out_c * sizeof(uint) + idx_out_w * indices_stride_y + idx_out_h * indices_stride_z + idx_out_n *
-                                           indices_stride_w;
-
-    // Store result
-    STORE_VECTOR_SELECT(index, uint, idx_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0));
-#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
-}
-#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
-\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/sobel_filter.cl b/src/core/CL/cl_kernels/sobel_filter.cl
deleted file mode 100644
index 7983734fc4..0000000000
--- a/src/core/CL/cl_kernels/sobel_filter.cl
+++ /dev/null
@@ -1,541 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/***********************************************/
-/*   Begin implementation of Sobel3x3 filter   */
-/***********************************************/
-
-/** This OpenCL kernel that computes a Sobel3x3 filter.
- *
- * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
- * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
- *
- * @param[in]  src_ptr                              Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                         Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                         Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source image
- * @param[out] dst_gx_ptr                           Pointer to the destination image. Supported data types: S16
- * @param[in]  dst_gx_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_gx_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_gx_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_gx_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[out] dst_gy_ptr                           Pointer to the destination image. Supported data types: S16
- * @param[in]  dst_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_gy_step_x                        dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_gy_step_y                        dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void sobel3x3(
-    IMAGE_DECLARATION(src)
-#ifdef GRAD_X
-    ,
-    IMAGE_DECLARATION(dst_gx)
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    ,
-    IMAGE_DECLARATION(dst_gy)
-#endif /* GRAD_Y */
-)
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-#ifdef GRAD_X
-    Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif /* GRAD_Y */
-
-    // Output pixels
-#ifdef GRAD_X
-    short8 gx = (short8)0;
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    short8 gy = (short8)0;
-#endif /* GRAD_Y */
-
-    // Row0
-    uchar16 temp   = vload16(0, offset(&src, -1, -1));
-    short8  left   = convert_short8(temp.s01234567);
-    short8  middle = convert_short8(temp.s12345678);
-    short8  right  = convert_short8(temp.s23456789);
-#ifdef GRAD_X
-    gx += left * (short8)(-1);
-    gx += right * (short8)(+1);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    gy += left * (short8)(-1);
-    gy += middle * (short8)(-2);
-    gy += right * (short8)(-1);
-#endif /* GRAD_Y */
-
-    // Row1
-    temp  = vload16(0, offset(&src, -1, 0));
-    left  = convert_short8(temp.s01234567);
-    right = convert_short8(temp.s23456789);
-#ifdef GRAD_X
-    gx += left * (short8)(-2);
-    gx += right * (short8)(+2);
-#endif /* GRAD_X */
-
-    // Row2
-    temp   = vload16(0, offset(&src, -1, 1));
-    left   = convert_short8(temp.s01234567);
-    middle = convert_short8(temp.s12345678);
-    right  = convert_short8(temp.s23456789);
-#ifdef GRAD_X
-    gx += left * (short8)(-1);
-    gx += right * (short8)(+1);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    gy += left * (short8)(+1);
-    gy += middle * (short8)(+2);
-    gy += right * (short8)(+1);
-#endif /* GRAD_Y */
-
-    // Store results
-#ifdef GRAD_X
-    vstore8(gx, 0, ((__global short *)dst_gx.ptr));
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    vstore8(gy, 0, ((__global short *)dst_gy.ptr));
-#endif /* GRAD_Y */
-}
-
-/**********************************************/
-/*    End implementation of Sobel3x3 filter   */
-/**********************************************/
-
-/***********************************************/
-/*   Begin implementation of Sobel5x5 filter   */
-/***********************************************/
-
-/** Compute a 1D horizontal sobel filter 1x5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
- *
- * @param[in] src             Pointer to source image.
- * @param[in] left1_coeff_gx  Weight of the most left pixel for gx
- * @param[in] left2_coeff_gx  Weight of the left pixel for gx
- * @param[in] middle_coeff_gx Weight of the middle pixel for gx
- * @param[in] right1_coeff_gx Weight of the right pixel for gx
- * @param[in] right2_coeff_gx Weight of the most right pixel for gx
- * @param[in] left1_coeff_gy  Weight of the most left pixel for gy
- * @param[in] left2_coeff_gy  Weight of the left pixel for gy
- * @param[in] middle_coeff_gy Weight of the middle pixel for gy
- * @param[in] right1_coeff_gy Weight of the right pixel for gy
- * @param[in] right2_coeff_gy Weight of the most right pixel for gy
- *
- * @return a short16 containing short8 gx and short8 gy values.
- */
-short16 sobel1x5(
-    Image      *src,
-    const short left1_coeff_gx,
-    const short left2_coeff_gx,
-    const short middle_coeff_gx,
-    const short right1_coeff_gx,
-    const short right2_coeff_gx,
-    const short left1_coeff_gy,
-    const short left2_coeff_gy,
-    const short middle_coeff_gy,
-    const short right1_coeff_gy,
-    const short right2_coeff_gy)
-{
-    uchar16 temp = vload16(0, offset(src, -2, 0));
-    short8  gx   = 0;
-    short8  gy   = 0;
-    short8  val;
-
-    val = convert_short8(temp.s01234567);
-    gx += val * (short8)left1_coeff_gx;
-    gy += val * (short8)left1_coeff_gy;
-
-    val = convert_short8(temp.s12345678);
-    gx += val * (short8)left2_coeff_gx;
-    gy += val * (short8)left2_coeff_gy;
-
-    val = convert_short8(temp.s23456789);
-    gx += val * (short8)middle_coeff_gx;
-    gy += val * (short8)middle_coeff_gy;
-
-    val = convert_short8(temp.s3456789a);
-    gx += val * (short8)right1_coeff_gx;
-    gy += val * (short8)right1_coeff_gy;
-
-    val = convert_short8(temp.s456789ab);
-    gx += val * (short8)right2_coeff_gx;
-    gy += val * (short8)right2_coeff_gy;
-
-    return (short16)(gx, gy);
-}
-
-/** Compute a 1D vertical sobel filter 5x1 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
- *
- * @param[in] src          Pointer to source image.
- * @param[in] up1_coeff    Weight of the most up pixel
- * @param[in] up2_coeff    Weight of the up pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] down1_coeff  Weight of the down pixel
- * @param[in] down2_coeff  Weight of the most down pixel
- *
- * @return a short8 containing 8 convoluted values.
- */
-short8 sobel5x1(
-    Image      *src,
-    const short up1_coeff,
-    const short up2_coeff,
-    const short middle_coeff,
-    const short down1_coeff,
-    const short down2_coeff)
-{
-    short8 val;
-    short8 out = (short8)0;
-
-    val = vload8(0, (__global short *)offset(src, 0, -2));
-    out += val * (short8)up1_coeff;
-
-    val = vload8(0, (__global short *)offset(src, 0, -1));
-    out += val * (short8)up2_coeff;
-
-    val = vload8(0, (__global short *)offset(src, 0, 0));
-    out += val * (short8)middle_coeff;
-
-    val = vload8(0, (__global short *)offset(src, 0, 1));
-    out += val * (short8)down1_coeff;
-
-    val = vload8(0, (__global short *)offset(src, 0, 2));
-    out += val * (short8)down2_coeff;
-
-    return (short8)(out);
-}
-
-/** Apply a 1x5 sobel matrix to a single channel U8 input image and output two temporary channel S16 images.
- *
- * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
- * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
- *
- * @param[in]  src_ptr                              Pointer to the source image.. Supported data types: U8
- * @param[in]  src_stride_x                         Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                         Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source image
- * @param[out] dst_gx_ptr                           Pointer to the destination image.. Supported data types: S16
- * @param[in]  dst_gx_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_gx_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_gx_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_gx_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[out] dst_gy_ptr                           Pointer to the destination image. Supported data types: S16
- * @param[in]  dst_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_gy_step_x                        dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_gy_step_y                        dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void sobel_separable1x5(
-    IMAGE_DECLARATION(src)
-#ifdef GRAD_X
-    ,
-    IMAGE_DECLARATION(dst_gx)
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    ,
-    IMAGE_DECLARATION(dst_gy)
-#endif /* GRAD_Y */
-)
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-#ifdef GRAD_X
-    Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif /* GRAD_Y */
-
-    // Output pixels
-    short16 gx_gy = sobel1x5(&src,
-                             -1, -2, 0, 2, 1,
-                             1, 4, 6, 4, 1);
-
-    // Store result in dst
-#ifdef GRAD_X
-    vstore8(gx_gy.s01234567, 0, ((__global short *)dst_gx.ptr));
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    vstore8(gx_gy.s89ABCDEF, 0, ((__global short *)dst_gy.ptr));
-#endif /* GRAD_Y */
-}
-
-/** Apply a 5x1 convolution matrix to two single channel S16 input temporary images
- *  and output two single channel S16 images.
- *
- * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
- * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
- *
- * @param[in]  src_x_ptr                            Pointer to the source image.. Supported data types: S16
- * @param[in]  src_x_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  src_x_step_x                         src_x_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_x_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_x_step_y                         src_x_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_x_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] dst_gx_ptr                           Pointer to the destination image. Supported data types: S16
- * @param[in]  dst_gx_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_gx_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_gx_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_gx_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  src_y_ptr                            Pointer to the source image. Supported data types: S16
- * @param[in]  src_y_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  src_y_step_x                         src_y_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_y_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_y_step_y                         src_y_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_y_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] dst_gy_ptr                           Pointer to the destination image. Supported data types: S16
- * @param[in]  dst_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_gy_step_x                        dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_gy_step_y                        dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  dummy                                Dummy parameter to easy conditional inclusion
- */
-__kernel void sobel_separable5x1(
-#ifdef GRAD_X
-    IMAGE_DECLARATION(src_x),
-    IMAGE_DECLARATION(dst_gx),
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    IMAGE_DECLARATION(src_y),
-    IMAGE_DECLARATION(dst_gy),
-#endif /* GRAD_Y */
-    int dummy)
-{
-#ifdef GRAD_X
-    Image src_x  = CONVERT_TO_IMAGE_STRUCT(src_x);
-    Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    Image src_y  = CONVERT_TO_IMAGE_STRUCT(src_y);
-    Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif /* GRAD_Y */
-
-#ifdef GRAD_X
-    short8 gx = sobel5x1(&src_x,
-                         1, 4, 6, 4, 1);
-    vstore8(gx, 0, ((__global short *)dst_gx.ptr));
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    short8 gy = sobel5x1(&src_y,
-                         -1, -2, 0, 2, 1);
-    vstore8(gy, 0, ((__global short *)dst_gy.ptr));
-#endif /* GRAD_Y */
-}
-
-/**********************************************/
-/*    End implementation of Sobel5x5 filter   */
-/**********************************************/
-
-/***********************************************/
-/*   Begin implementation of Sobel7x7 filter   */
-/***********************************************/
-
-/* Sobel 1x7 horizontal X / 7x1 vertical Y coefficients */
-#define X0 -1
-#define X1 -4
-#define X2 -5
-#define X3 0
-#define X4 5
-#define X5 4
-#define X6 1
-
-/* Sobel 1x7 vertical X / 7x1 horizontal Y coefficients */
-#define Y0 1
-#define Y1 6
-#define Y2 15
-#define Y3 20
-#define Y4 15
-#define Y5 6
-#define Y6 1
-
-/* Calculates single horizontal iteration. */
-#define SOBEL1x1_HOR(src, gx, gy, idx)                               \
-    {                                                                \
-        int8 val = convert_int8(vload8(0, offset(src, idx - 3, 0))); \
-        gx += val * X##idx;                                          \
-        gy += val * Y##idx;                                          \
-    }
-
-/* Calculates single vertical iteration. */
-#define SOBEL1x1_VERT(src, g, direction, idx)                          \
-    {                                                                  \
-        int8 val = vload8(0, (__global int *)offset(src, 0, idx - 3)); \
-        g += val * (int8)direction##idx;                               \
-    }
-
-/* Calculates a 1x7 horizontal iteration. */
-#define SOBEL1x7(ptr, gx, gy)                        \
-    SOBEL1x1_HOR(ptr, gx, gy, 0)                     \
-    SOBEL1x1_HOR(ptr, gx, gy, 1)                 \
-    SOBEL1x1_HOR(ptr, gx, gy, 2)             \
-    SOBEL1x1_HOR(ptr, gx, gy, 3)         \
-    SOBEL1x1_HOR(ptr, gx, gy, 4)     \
-    SOBEL1x1_HOR(ptr, gx, gy, 5) \
-    SOBEL1x1_HOR(ptr, gx, gy, 6)
-
-/* Calculates a 7x1 vertical iteration. */
-#define SOBEL7x1(ptr, g, direction)                         \
-    SOBEL1x1_VERT(ptr, g, direction, 0)                     \
-    SOBEL1x1_VERT(ptr, g, direction, 1)                 \
-    SOBEL1x1_VERT(ptr, g, direction, 2)             \
-    SOBEL1x1_VERT(ptr, g, direction, 3)         \
-    SOBEL1x1_VERT(ptr, g, direction, 4)     \
-    SOBEL1x1_VERT(ptr, g, direction, 5) \
-    SOBEL1x1_VERT(ptr, g, direction, 6)
-
-/** Apply a 1x7 sobel matrix to a single channel U8 input image and output two temporary channel S16 images and leave the borders undefined.
- *
- * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
- * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
- *
- * @param[in]  src_ptr                              Pointer to the source image. Supported data types: U8
- * @param[in]  src_stride_x                         Stride of the source image in X dimension (in bytes)
- * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                         Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source image
- * @param[out] dst_gx_ptr                           Pointer to the destination image. Supported data types: S32
- * @param[in]  dst_gx_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_gx_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_gx_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_gx_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[out] dst_gy_ptr                           Pointer to the destination image. Supported data types: S32
- * @param[in]  dst_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_gy_step_x                        dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_gy_step_y                        dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void sobel_separable1x7(
-    IMAGE_DECLARATION(src)
-#ifdef GRAD_X
-    ,
-    IMAGE_DECLARATION(dst_gx)
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    ,
-    IMAGE_DECLARATION(dst_gy)
-#endif /* GRAD_Y */
-)
-{
-    Image src = CONVERT_TO_IMAGE_STRUCT(src);
-#ifdef GRAD_X
-    Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif /* GRAD_Y */
-    int8 gx = (int8)0;
-    int8 gy = (int8)0;
-
-    SOBEL1x7(&src, gx, gy);
-
-    // Store result in dst
-#ifdef GRAD_X
-    vstore8(gx, 0, ((__global int *)dst_gx.ptr));
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    vstore8(gy, 0, ((__global int *)dst_gy.ptr));
-#endif /* GRAD_Y */
-}
-
-/** Apply a 7x1 convolution matrix to two single channel S16 input temporary images and output two single channel S16 images and leave the borders undefined.
- *
- * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
- * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
- *
- * @param[in]  src_x_ptr                            Pointer to the source image. Supported data types: S32
- * @param[in]  src_x_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  src_x_step_x                         src_x_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_x_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_x_step_y                         src_x_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_x_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] dst_gx_ptr                           Pointer to the destination image. Supported data types: S16
- * @param[in]  dst_gx_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_gx_step_x                        dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_gx_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_gx_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  src_y_ptr                            Pointer to the source image. Supported data types: S32
- * @param[in]  src_y_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  src_y_step_x                         src_y_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_y_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  src_y_step_y                         src_y_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_y_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] dst_gy_ptr                           Pointer to the destination image. Supported data types: S16
- * @param[in]  dst_gy_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  dst_gy_step_x                        dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_gy_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  dst_gy_step_y                        dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in]  dummy                                Dummy parameter to easy conditional inclusion
- */
-__kernel void sobel_separable7x1(
-#ifdef GRAD_X
-    IMAGE_DECLARATION(src_x),
-    IMAGE_DECLARATION(dst_gx),
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    IMAGE_DECLARATION(src_y),
-    IMAGE_DECLARATION(dst_gy),
-#endif /* GRAD_Y */
-    int dummy)
-{
-#ifdef GRAD_X
-    Image src_x  = CONVERT_TO_IMAGE_STRUCT(src_x);
-    Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    Image src_y  = CONVERT_TO_IMAGE_STRUCT(src_y);
-    Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif /* GRAD_Y */
-
-    // Output pixels
-#ifdef GRAD_X
-    int8 gx = 0;
-    SOBEL7x1(&src_x, gx, Y);
-    vstore8(gx, 0, (__global int *)dst_gx.ptr);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
-    int8 gy = 0;
-    SOBEL7x1(&src_y, gy, X);
-    vstore8(gy, 0, (__global int *)dst_gy.ptr);
-#endif /* GRAD_Y */
-}
-
-/**********************************************/
-/*    End implementation of Sobel7x7 filter   */
-/**********************************************/
diff --git a/src/core/gpu/cl/ClKernelLibrary.cpp b/src/core/gpu/cl/ClKernelLibrary.cpp
index 73da93c1f5..1ba307cb1c 100644
--- a/src/core/gpu/cl/ClKernelLibrary.cpp
+++ b/src/core/gpu/cl/ClKernelLibrary.cpp
@@ -177,484 +177,508 @@ namespace opencl
 {
 const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map =
 {
-    { "activation_layer", "activation_layer.cl" },
-    { "activation_layer_quant", "activation_layer_quant.cl" },
-    { "activation_layer_quant_f32", "activation_layer_quant.cl" },
-    { "arg_min_max_x", "arg_min_max.cl" },
-    { "arg_min_max_y", "arg_min_max.cl" },
-    { "arg_min_max_z", "arg_min_max.cl" },
-    { "arg_min_max_w", "arg_min_max.cl" },
-    { "batch_to_space_nchw", "batch_to_space.cl" },
-    { "batch_to_space_static_nchw", "batch_to_space.cl" },
-    { "batch_to_space_nhwc", "batch_to_space.cl" },
-    { "batch_to_space_static_nhwc", "batch_to_space.cl" },
-    { "batchnormalization_layer_nchw", "batchnormalization_layer.cl" },
-    { "batchnormalization_layer_nhwc", "batchnormalization_layer.cl" },
-    { "bitwise_or", "bitwise_op.cl" },
-    { "bitwise_and", "bitwise_op.cl" },
-    { "bitwise_xor", "bitwise_op.cl" },
-    { "bitwise_not", "bitwise_op.cl" },
-    { "bounding_box_transform", "bounding_box_transform.cl" },
-    { "bounding_box_transform_quantized", "bounding_box_transform_quantized.cl" },
-    { "channel_shuffle_nchw", "channel_shuffle.cl" },
-    { "channel_shuffle_nhwc", "channel_shuffle.cl" },
-    { "compare_equal", "comparisons.cl" },
-    { "compare_equal_quantized", "comparisons.cl" },
-    { "compare_notequal", "comparisons.cl" },
-    { "compare_notequal_quantized", "comparisons.cl" },
-    { "compare_greater", "comparisons.cl" },
-    { "compare_greater_quantized", "comparisons.cl" },
-    { "compare_greaterequal", "comparisons.cl" },
-    { "compare_greaterequal_quantized", "comparisons.cl" },
-    { "compare_less", "comparisons.cl" },
-    { "compare_less_quantized", "comparisons.cl" },
-    { "compare_lessequal", "comparisons.cl" },
-    { "compare_lessequal_quantized", "comparisons.cl" },
-    { "concatenate", "concatenate.cl" },
-    { "concatenate_width", "concatenate.cl" },
-    { "concatenate_height", "concatenate.cl" },
-    { "concatenate_width_x2", "concatenate.cl" },
-    { "concatenate_width_x4", "concatenate.cl" },
-    { "col2im", "col2im.cl" },
-    { "cast_down", "cast.cl" },
-    { "cast_up", "cast.cl" },
-    { "convert_fc_weights", "convert_fc_weights.cl" },
-    { "copy_tensor", "copy_tensor.cl" },
-    { "crop_tensor", "crop_tensor.cl" },
-    { "deconvolution_reshape", "deconvolution_layer.cl" },
-    { "deconvolution_upsample", "deconvolution_layer.cl" },
-    { "depth_to_space_nchw", "depth_to_space.cl" },
-    { "depth_to_space_nhwc", "depth_to_space.cl" },
-    { "dequantization_layer", "dequantization_layer.cl" },
-    { "dequantization_layer_per_channel_nhwc", "dequantization_layer.cl" },
-    { "dequantization_layer_per_channel_nchw", "dequantization_layer.cl" },
-    { "dwc_native_fp_nhwc", "dwc_native_fp_nhwc.cl" },
-    { "dwc_native_quantized_nhwc", "dwc_native_quantized_nhwc.cl" },
-    { "direct_convolution_nhwc", "direct_convolution.cl" },
-    { "direct_convolution1x1", "direct_convolution1x1.cl" },
-    { "direct_convolution1x1_f32_bifrost", "direct_convolution1x1.cl" },
-    { "direct_convolution3x3", "direct_convolution3x3.cl" },
-    { "direct_convolution3x3_f32_bifrost", "direct_convolution3x3.cl" },
-    { "direct_convolution5x5", "direct_convolution5x5.cl" },
-    { "direct_convolution5x5_f32_bifrost", "direct_convolution5x5.cl" },
-    { "direct_convolution_quantized", "direct_convolution_quantized.cl" },
-    { "elementwise_operation_ADD", "elementwise_operation.cl" },
-    { "elementwise_operation_SUB", "elementwise_operation.cl" },
-    { "elementwise_operation_MAX", "elementwise_operation.cl" },
-    { "elementwise_operation_MIN", "elementwise_operation.cl" },
-    { "elementwise_operation_DIV", "elementwise_operation.cl" },
-    { "elementwise_operation_SQUARED_DIFF", "elementwise_operation.cl" },
-    { "elementwise_operation_POWER", "elementwise_operation.cl" },
-    { "elementwise_operation_PRELU", "elementwise_operation.cl" },
-    { "elementwise_operation_AND", "elementwise_operation.cl" },
-    { "elementwise_operation_OR", "elementwise_operation.cl" },
-    { "elementwise_operation_ADD_quantized", "elementwise_operation_quantized.cl" },
-    { "elementwise_operation_SUB_quantized", "elementwise_operation_quantized.cl" },
-    { "elementwise_operation_MAX_quantized", "elementwise_operation_quantized.cl" },
-    { "elementwise_operation_MIN_quantized", "elementwise_operation_quantized.cl" },
-    { "elementwise_operation_DIV_quantized", "elementwise_operation_quantized.cl" },
-    { "elementwise_operation_SQUARED_DIFF_quantized", "elementwise_operation_quantized.cl" },
-    { "elementwise_operation_PRELU_quantized", "elementwise_operation_quantized.cl" },
-    { "elementwise_unary", "elementwise_unary.cl" },
-    { "fft_digit_reverse_axis_0", "fft_digit_reverse.cl" },
-    { "fft_digit_reverse_axis_1", "fft_digit_reverse.cl" },
-    { "fft_radix_2_first_stage_axis_0", "fft.cl" },
-    { "fft_radix_2_first_stage_axis_1", "fft.cl" },
-    { "fft_radix_2_axis_0", "fft.cl" },
-    { "fft_radix_2_axis_1", "fft.cl" },
-    { "fft_radix_3_first_stage_axis_0", "fft.cl" },
-    { "fft_radix_3_first_stage_axis_1", "fft.cl" },
-    { "fft_radix_3_axis_0", "fft.cl" },
-    { "fft_radix_3_axis_1", "fft.cl" },
-    { "fft_radix_4_first_stage_axis_0", "fft.cl" },
-    { "fft_radix_4_first_stage_axis_1", "fft.cl" },
-    { "fft_radix_4_axis_0", "fft.cl" },
-    { "fft_radix_4_axis_1", "fft.cl" },
-    { "fft_radix_5_first_stage_axis_0", "fft.cl" },
-    { "fft_radix_5_first_stage_axis_1", "fft.cl" },
-    { "fft_radix_5_axis_0", "fft.cl" },
-    { "fft_radix_5_axis_1", "fft.cl" },
-    { "fft_radix_7_first_stage_axis_0", "fft.cl" },
-    { "fft_radix_7_first_stage_axis_1", "fft.cl" },
-    { "fft_radix_7_axis_0", "fft.cl" },
-    { "fft_radix_7_axis_1", "fft.cl" },
-    { "fft_radix_8_first_stage_axis_0", "fft.cl" },
-    { "fft_radix_8_first_stage_axis_1", "fft.cl" },
-    { "fft_radix_8_axis_0", "fft.cl" },
-    { "fft_radix_8_axis_1", "fft.cl" },
-    { "fft_scale_conj", "fft_scale.cl" },
-    { "fill_image_borders_constant", "fill_border.cl" },
-    { "fill_image_borders_replicate", "fill_border.cl" },
-    { "floor_layer", "floor.cl" },
-    { "fuse_batchnormalization_layer", "batchnormalization_layer.cl" },
-    { "gather", "gather.cl" },
-    { "gemm_ma_f16", "gemm.cl" },
-    { "gemm_ma_f32", "gemm.cl" },
-    { "gemm_mv", "gemv.cl" },
-    { "gemm_mv_quantized", "gemv.cl" },
-    { "gemm_mm_interleaved_transposed_f16", "gemm_v1.cl" },
-    { "gemm_mm_interleaved_transposed_f16_acc32", "gemm_v1.cl" },
-    { "gemm_mm_interleaved_transposed_f16_bifrost", "gemm_v1.cl" },
-    { "gemm_mm_interleaved_transposed_f32", "gemm_v1.cl" },
-    { "gemm_mm_interleaved_transposed_f32_bifrost", "gemm_v1.cl" },
-    { "gemm_mm_floating_point", "gemm_v1.cl" },
-    { "gemm_mm_floating_point_f16_bifrost", "gemm_v1.cl" },
-    { "gemm_mm_floating_point_f16_bifrost_acc32", "gemm_v1.cl" },
-    { "gemm_mm_floating_point_f32_bifrost", "gemm_v1.cl" },
-    { "gemm_mm_floating_point_f32_bifrost_1000", "gemm_v1.cl" },
-    { "gemm_mm_native", "gemm.cl" },
-    { "gemm_mm_reshaped_lhs_nt_rhs_t", "gemm.cl" },
-    { "gemm_mm_reshaped_lhs_nt_rhs_t_texture", "gemm.cl" },
-    { "gemm_mm_reshaped_lhs_t_rhs_nt", "gemm.cl" },
-    { "gemm_mm_reshaped_lhs_t_rhs_nt_texture", "gemm.cl" },
-    { "gemm_mm_reshaped_only_rhs_nt", "gemm.cl" },
-    { "gemm_mm_reshaped_only_rhs_nt_texture", "gemm.cl" },
-    { "gemm_mm_reshaped_only_rhs_t", "gemm.cl" },
-    { "gemm_mm_reshaped_only_rhs_t_texture", "gemm.cl" },
-    { "gemm_lc_vm_f32", "gemm.cl" },
-    { "gemm_reshape_lhs_matrix_nt", "gemm.cl" },
-    { "gemm_reshape_lhs_matrix_t", "gemm.cl" },
-    { "gemm_reshape_rhs_matrix_nt", "gemm.cl" },
-    { "gemm_reshape_rhs_matrix_t", "gemm.cl" },
-    { "gemmlowp_matrix_a_reduction", "gemmlowp.cl" },
-    { "gemmlowp_matrix_a_reduction_dot8", "gemmlowp.cl" },
-    { "gemmlowp_matrix_b_reduction", "gemmlowp.cl" },
-    { "gemmlowp_mm_native", "gemmlowp.cl" },
-    { "gemmlowp_mm_reshaped_lhs_nt_rhs_t", "gemmlowp.cl" },
-    { "gemmlowp_mm_reshaped_only_rhs_t", "gemmlowp.cl" },
-    { "gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint", "gemmlowp.cl" },
-    { "gemmlowp_offset_contribution", "gemmlowp.cl" },
-    { "gemmlowp_offset_contribution_quantize_down", "gemmlowp.cl" },
-    { "gemmlowp_offset_contribution_quantize_down_fixedpoint", "gemmlowp.cl" },
-    { "gemmlowp_output_stage_quantize_down", "gemmlowp.cl" },
-    { "gemmlowp_output_stage_quantize_down_fixedpoint", "gemmlowp.cl" },
-    { "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16", "gemmlowp.cl" },
-    { "gemmlowp_output_stage_quantize_down_float", "gemmlowp.cl" },
-    { "generate_proposals_compute_all_anchors", "generate_proposals.cl" },
-    { "generate_proposals_compute_all_anchors_quantized", "generate_proposals_quantized.cl" },
-    { "im2col1x1_stridex1_nchw", "im2col.cl" },
-    { "im2col3x3_nchw", "im2col.cl" },
-    { "im2col5x5_nchw", "im2col.cl" },
-    { "im2col11x11_padx0_pady0_nchw", "im2col.cl" },
-    { "im2col_generic_nchw", "im2col.cl" },
-    { "im2col_generic_padx0_pady0_nchw", "im2col.cl" },
-    { "im2col3x3_nhwc", "im2col.cl" },
-    { "im2col9x9_nhwc", "im2col.cl" },
-    { "im2col_generic_nhwc", "im2col.cl" },
-    { "instance_normalization", "instance_normalization.cl" },
-    { "compute_mean_var", "instance_normalization.cl" },
-    { "l2_normalize_x", "l2_normalize.cl" },
-    { "l2_normalize_y", "l2_normalize.cl" },
-    { "l2_normalize_z", "l2_normalize.cl" },
-    { "max_unpooling_layer_2", "unpooling_layer.cl" },
-    { "mean_stddev_normalization", "mean_stddev_normalization.cl" },
-    { "memset", "memset.cl" },
-    { "minmax_layer", "minmax_layer.cl" },
-    { "non_max_suppression", "nonmax.cl" },
-    { "normalization_layer_cross_map_nchw", "normalization_layer.cl" },
-    { "normalization_layer_cross_map_nhwc", "normalization_layer.cl" },
-    { "normalization_layer_in_map_nchw", "normalization_layer.cl" },
-    { "normalization_layer_in_map_nhwc", "normalization_layer.cl" },
-    { "normalize_planar_yuv_layer_nchw", "normalize_planar_yuv_layer.cl" },
-    { "normalize_planar_yuv_layer_nhwc", "normalize_planar_yuv_layer.cl" },
-    { "normalize_planar_yuv_layer_q8_nchw", "normalize_planar_yuv_layer_quantized.cl" },
-    { "normalize_planar_yuv_layer_q8_nhwc", "normalize_planar_yuv_layer_quantized.cl" },
-    { "pad_layer_constant", "pad_layer.cl" },
-    { "pad_layer_symmetric_reflect", "pad_layer.cl" },
-    { "permute", "permute.cl" },
-    { "pixelwise_mul_complex", "pixelwise_mul_float.cl" },
-    { "pixelwise_mul_float", "pixelwise_mul_float.cl" },
-    { "pixelwise_mul_int", "pixelwise_mul_int.cl" },
-    { "pixelwise_mul_quantized", "pixelwise_mul_int.cl" },
-    { "pooling_layer_2", "pooling_layer.cl" },
-    { "pooling_layer_3", "pooling_layer.cl" },
-    { "pooling_layer_optimized_3", "pooling_layer.cl" },
-    { "pooling_layer_7", "pooling_layer.cl" },
-    { "pooling_layer_MxN_nchw", "pooling_layer.cl" },
-    { "pooling_layer_MxN_nhwc", "pooling_layer.cl" },
-    { "pooling_layer_2x2_nhwc", "pooling_layer.cl" },
-    { "pooling_layer_2_nchw_indices_fp32", "pooling_layer.cl" },
-    { "pooling_layer_2_nchw_indices_fp16", "pooling_layer.cl" },
-    { "pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl" },
-    { "pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl" },
-    { "prior_box_layer_nchw", "prior_box_layer.cl" },
-    { "qlstm_layer_normalization", "qlstm_layer_normalization.cl" },
-    { "quantization_layer", "quantization_layer.cl" },
-    { "range", "range.cl" },
-    { "range_quantized", "range.cl" },
-    { "reduction_operation_x", "reduction_operation.cl" },
-    { "reduction_operation_non_parallel_x", "reduction_operation.cl" },
-    { "reduction_operation_y", "reduction_operation.cl" },
-    { "reduction_operation_z", "reduction_operation.cl" },
-    { "reduction_operation_w", "reduction_operation.cl" },
-    { "remap_nearest_neighbour_nchw", "remap.cl" },
-    { "remap_bilinear_nchw", "remap.cl" },
-    { "remap_nearest_neighbour_nhwc", "remap.cl" },
-    { "remap_bilinear_nhwc", "remap.cl" },
-    { "reorg_layer_nchw", "reorg_layer.cl" },
-    { "reorg_layer_nhwc", "reorg_layer.cl" },
-    { "reshape_layer", "reshape_layer.cl" },
-    { "reshape_to_columns", "convolution_layer.cl" },
-    { "reverse", "reverse.cl" },
-    { "roi_align_layer", "roi_align_layer.cl" },
-    { "roi_align_layer_quantized", "roi_align_layer_quantized.cl" },
-    { "roi_pooling_layer", "roi_pooling_layer.cl" },
-    { "scale_nearest_neighbour_nchw", "scale.cl" },
-    { "scale_nearest_neighbour_nhwc", "scale.cl" },
-    { "scale_bilinear_nchw", "scale.cl" },
-    { "scale_bilinear_nhwc", "scale.cl" },
-    { "scale_bilinear_quantized_nchw", "scale_quantized.cl" },
-    { "scale_bilinear_quantized_nhwc", "scale_quantized.cl" },
-    { "select_same_rank", "select.cl" },
-    { "select_different_rank_2", "select.cl" },
-    { "select_different_rank_n", "select.cl" },
-    { "softmax_layer_norm", "softmax_layer.cl" },
-    { "softmax_layer_norm_quantized", "softmax_layer_quantized.cl" },
-    { "softmax_layer_max_shift_exp_sum_quantized_serial", "softmax_layer_quantized.cl" },
-    { "softmax_layer_max_shift_exp_sum_quantized_parallel", "softmax_layer_quantized.cl" },
-    { "softmax_layer_max_shift_exp_sum_serial", "softmax_layer.cl" },
-    { "space_to_batch_nchw", "space_to_batch.cl" },
-    { "space_to_batch_static_nchw", "space_to_batch.cl" },
-    { "space_to_batch_nhwc", "space_to_batch.cl" },
-    { "space_to_batch_static_nhwc", "space_to_batch.cl" },
-    { "space_to_depth_nchw", "space_to_depth.cl" },
-    { "space_to_depth_nhwc", "space_to_depth.cl" },
-    { "softmax_layer_max_shift_exp_sum_parallel", "softmax_layer.cl" },
-    { "stack_layer", "stack_layer.cl" },
-    { "strided_slice", "slice_ops.cl" },
-    { "tile", "tile.cl" },
-    { "transpose", "transpose.cl" },
-    { "upsample_layer_nchw", "upsample_layer.cl" },
-    { "upsample_layer_nhwc", "upsample_layer.cl" },
-    { "winograd_filter_transform_2x2_3x3_nchw", "winograd_filter_transform.cl" },
-    { "winograd_filter_transform_2x1_3x1_nchw", "winograd_filter_transform.cl" },
-    { "winograd_filter_transform_1x2_1x3_nchw", "winograd_filter_transform.cl" },
-    { "winograd_filter_transform_4x4_3x3_nchw", "winograd_filter_transform.cl" },
-    { "winograd_filter_transform_4x1_3x1_nchw", "winograd_filter_transform.cl" },
-    { "winograd_filter_transform_1x4_1x3_nchw", "winograd_filter_transform.cl" },
-    { "winograd_filter_transform_4x4_5x5_nchw", "winograd_filter_transform.cl" },
-    { "winograd_filter_transform_4x1_5x1_nchw", "winograd_filter_transform.cl" },
-    { "winograd_filter_transform_1x4_1x5_nchw", "winograd_filter_transform.cl" },
-    { "winograd_filter_transform_4x1_3x1_nhwc", "winograd_filter_transform.cl" },
-    { "winograd_filter_transform_1x4_1x3_nhwc", "winograd_filter_transform.cl" },
-    { "winograd_filter_transform_4x4_3x3_nhwc", "winograd_filter_transform.cl" },
-    { "winograd_filter_transform_4x4_5x5_nhwc", "winograd_filter_transform.cl" },
-    { "winograd_filter_transform_4x1_5x1_nhwc", "winograd_filter_transform.cl" },
-    { "winograd_filter_transform_1x4_1x5_nhwc", "winograd_filter_transform.cl" },
-    { "winograd_filter_transform_2x2_7x7_nhwc", "winograd_filter_transform.cl" },
-    { "winograd_filter_transform_2x1_7x1_nhwc", "winograd_filter_transform.cl" },
-    { "winograd_filter_transform_1x2_1x7_nhwc", "winograd_filter_transform.cl" },
-    { "winograd_input_transform_2x2_3x3_stepz1_nchw", "winograd_input_transform.cl" },
-    { "winograd_input_transform_2x2_3x3_stepz2_nchw", "winograd_input_transform.cl" },
-    { "winograd_input_transform_2x1_3x1_stepz1_nchw", "winograd_input_transform.cl" },
-    { "winograd_input_transform_2x1_3x1_stepz2_nchw", "winograd_input_transform.cl" },
-    { "winograd_input_transform_1x2_1x3_stepz1_nchw", "winograd_input_transform.cl" },
-    { "winograd_input_transform_1x2_1x3_stepz2_nchw", "winograd_input_transform.cl" },
-    { "winograd_input_transform_4x4_3x3_stepz1_nchw", "winograd_input_transform.cl" },
-    { "winograd_input_transform_4x1_3x1_stepz1_nchw", "winograd_input_transform.cl" },
-    { "winograd_input_transform_1x4_1x3_stepz1_nchw", "winograd_input_transform.cl" },
-    { "winograd_input_transform_4x4_5x5_stepz1_nchw", "winograd_input_transform.cl" },
-    { "winograd_input_transform_4x1_5x1_stepz1_nchw", "winograd_input_transform.cl" },
-    { "winograd_input_transform_1x4_1x5_stepz1_nchw", "winograd_input_transform.cl" },
-    { "winograd_input_transform_4x1_3x1_stepz1_nhwc", "winograd_input_transform.cl" },
-    { "winograd_input_transform_1x4_1x3_stepz1_nhwc", "winograd_input_transform.cl" },
-    { "winograd_input_transform_4x4_3x3_stepz1_nhwc", "winograd_input_transform.cl" },
-    { "winograd_input_transform_4x4_5x5_stepz1_nhwc", "winograd_input_transform.cl" },
-    { "winograd_input_transform_4x1_5x1_stepz1_nhwc", "winograd_input_transform.cl" },
-    { "winograd_input_transform_1x4_1x5_stepz1_nhwc", "winograd_input_transform.cl" },
-    { "winograd_input_transform_2x2_7x7_stepz1_nhwc", "winograd_input_transform.cl" },
-    { "winograd_input_transform_2x1_7x1_stepz1_nhwc", "winograd_input_transform.cl" },
-    { "winograd_input_transform_1x2_1x7_stepz1_nhwc", "winograd_input_transform.cl" },
-    { "winograd_output_transform_2x2_3x3_nchw", "winograd_output_transform.cl" },
-    { "winograd_output_transform_2x1_3x1_nchw", "winograd_output_transform.cl" },
-    { "winograd_output_transform_1x2_1x3_nchw", "winograd_output_transform.cl" },
-    { "winograd_output_transform_4x4_3x3_nchw", "winograd_output_transform.cl" },
-    { "winograd_output_transform_4x1_3x1_nchw", "winograd_output_transform.cl" },
-    { "winograd_output_transform_1x4_1x3_nchw", "winograd_output_transform.cl" },
-    { "winograd_output_transform_4x4_5x5_nchw", "winograd_output_transform.cl" },
-    { "winograd_output_transform_4x1_5x1_nchw", "winograd_output_transform.cl" },
-    { "winograd_output_transform_1x4_1x5_nchw", "winograd_output_transform.cl" },
-    { "winograd_output_transform_4x1_3x1_nhwc", "winograd_output_transform.cl" },
-    { "winograd_output_transform_1x4_1x3_nhwc", "winograd_output_transform.cl" },
-    { "winograd_output_transform_4x4_3x3_nhwc", "winograd_output_transform.cl" },
-    { "winograd_output_transform_4x4_5x5_nhwc", "winograd_output_transform.cl" },
-    { "winograd_output_transform_4x1_5x1_nhwc", "winograd_output_transform.cl" },
-    { "winograd_output_transform_1x4_1x5_nhwc", "winograd_output_transform.cl" },
-    { "winograd_output_transform_2x2_7x7_nhwc", "winograd_output_transform.cl" },
-    { "winograd_output_transform_2x1_7x1_nhwc", "winograd_output_transform.cl" },
-    { "winograd_output_transform_1x2_1x7_nhwc", "winograd_output_transform.cl" },
+    { "activation_layer", "common/activation_layer.cl" },
+    { "activation_layer_quant", "common/activation_layer_quant.cl" },
+    { "activation_layer_quant_f32", "common/activation_layer_quant.cl" },
+    { "arg_min_max_x", "common/arg_min_max.cl" },
+    { "arg_min_max_y", "common/arg_min_max.cl" },
+    { "arg_min_max_z", "common/arg_min_max.cl" },
+    { "arg_min_max_w", "common/arg_min_max.cl" },
+    { "batch_to_space_nchw", "nchw/batch_to_space.cl" },
+    { "batch_to_space_static_nchw", "nchw/batch_to_space.cl" },
+    { "batch_to_space_nhwc", "nhwc/batch_to_space.cl" },
+    { "batch_to_space_static_nhwc", "nhwc/batch_to_space.cl" },
+    { "batchnormalization_layer_nchw", "nchw/batchnormalization_layer.cl" },
+    { "batchnormalization_layer_nhwc", "nhwc/batchnormalization_layer.cl" },
+    { "bitwise_or", "common/bitwise_op.cl" },
+    { "bitwise_and", "common/bitwise_op.cl" },
+    { "bitwise_xor", "common/bitwise_op.cl" },
+    { "bitwise_not", "common/bitwise_op.cl" },
+    { "bounding_box_transform", "common/bounding_box_transform.cl" },
+    { "bounding_box_transform_quantized", "common/bounding_box_transform_quantized.cl" },
+    { "channel_shuffle_nchw", "nchw/channel_shuffle.cl" },
+    { "channel_shuffle_nhwc", "nhwc/channel_shuffle.cl" },
+    { "compare_equal", "common/comparisons.cl" },
+    { "compare_equal_quantized", "common/comparisons.cl" },
+    { "compare_notequal", "common/comparisons.cl" },
+    { "compare_notequal_quantized", "common/comparisons.cl" },
+    { "compare_greater", "common/comparisons.cl" },
+    { "compare_greater_quantized", "common/comparisons.cl" },
+    { "compare_greaterequal", "common/comparisons.cl" },
+    { "compare_greaterequal_quantized", "common/comparisons.cl" },
+    { "compare_less", "common/comparisons.cl" },
+    { "compare_less_quantized", "common/comparisons.cl" },
+    { "compare_lessequal", "common/comparisons.cl" },
+    { "compare_lessequal_quantized", "common/comparisons.cl" },
+    { "concatenate", "common/concatenate.cl" },
+    { "concatenate_width", "common/concatenate.cl" },
+    { "concatenate_height", "common/concatenate.cl" },
+    { "concatenate_width_x2", "common/concatenate.cl" },
+    { "concatenate_width_x4", "common/concatenate.cl" },
+    { "col2im", "common/col2im.cl" },
+    { "cast_down", "common/cast.cl" },
+    { "cast_up", "common/cast.cl" },
+    { "convert_fc_weights", "common/convert_fc_weights.cl" },
+    { "copy_tensor", "common/copy_tensor.cl" },
+    { "crop_tensor", "common/crop_tensor.cl" },
+    { "deconvolution_reshape", "common/deconvolution_layer.cl" },
+    { "deconvolution_upsample", "common/deconvolution_layer.cl" },
+    { "depth_to_space_nchw", "nchw/depth_to_space.cl" },
+    { "depth_to_space_nhwc", "nhwc/depth_to_space.cl" },
+    { "dequantization_layer", "common/dequantization_layer.cl" },
+    { "dequantization_layer_per_channel_nhwc", "nhwc/dequantization_layer.cl" },
+    { "dequantization_layer_per_channel_nchw", "nchw/dequantization_layer.cl" },
+    { "dwc_native_fp_nhwc", "nhwc/dwc_native_fp_nhwc.cl" },
+    { "dwc_native_quantized_nhwc", "nhwc/dwc_native_quantized_nhwc.cl" },
+    { "direct_convolution_nhwc", "nhwc/direct_convolution.cl" },
+    { "direct_convolution1x1", "nchw/direct_convolution1x1.cl" },
+    { "direct_convolution1x1_f32_bifrost", "nchw/direct_convolution1x1.cl" },
+    { "direct_convolution3x3", "nchw/direct_convolution3x3.cl" },
+    { "direct_convolution3x3_f32_bifrost", "nchw/direct_convolution3x3.cl" },
+    { "direct_convolution5x5", "nchw/direct_convolution5x5.cl" },
+    { "direct_convolution5x5_f32_bifrost", "nchw/direct_convolution5x5.cl" },
+    { "direct_convolution_quantized", "nchw/direct_convolution_quantized.cl" },
+    { "elementwise_operation_ADD", "common/elementwise_operation.cl" },
+    { "elementwise_operation_SUB", "common/elementwise_operation.cl" },
+    { "elementwise_operation_MAX", "common/elementwise_operation.cl" },
+    { "elementwise_operation_MIN", "common/elementwise_operation.cl" },
+    { "elementwise_operation_DIV", "common/elementwise_operation.cl" },
+    { "elementwise_operation_SQUARED_DIFF", "common/elementwise_operation.cl" },
+    { "elementwise_operation_POWER", "common/elementwise_operation.cl" },
+    { "elementwise_operation_PRELU", "common/elementwise_operation.cl" },
+    { "elementwise_operation_AND", "common/elementwise_operation.cl" },
+    { "elementwise_operation_OR", "common/elementwise_operation.cl" },
+    { "elementwise_operation_ADD_quantized", "common/elementwise_operation_quantized.cl" },
+    { "elementwise_operation_SUB_quantized", "common/elementwise_operation_quantized.cl" },
+    { "elementwise_operation_MAX_quantized", "common/elementwise_operation_quantized.cl" },
+    { "elementwise_operation_MIN_quantized", "common/elementwise_operation_quantized.cl" },
+    { "elementwise_operation_DIV_quantized", "common/elementwise_operation_quantized.cl" },
+    { "elementwise_operation_SQUARED_DIFF_quantized", "common/elementwise_operation_quantized.cl" },
+    { "elementwise_operation_PRELU_quantized", "common/elementwise_operation_quantized.cl" },
+    { "elementwise_unary", "common/elementwise_unary.cl" },
+    { "fft_digit_reverse_axis_0", "common/fft_digit_reverse.cl" },
+    { "fft_digit_reverse_axis_1", "common/fft_digit_reverse.cl" },
+    { "fft_radix_2_first_stage_axis_0", "common/fft.cl" },
+    { "fft_radix_2_first_stage_axis_1", "common/fft.cl" },
+    { "fft_radix_2_axis_0", "common/fft.cl" },
+    { "fft_radix_2_axis_1", "common/fft.cl" },
+    { "fft_radix_3_first_stage_axis_0", "common/fft.cl" },
+    { "fft_radix_3_first_stage_axis_1", "common/fft.cl" },
+    { "fft_radix_3_axis_0", "common/fft.cl" },
+    { "fft_radix_3_axis_1", "common/fft.cl" },
+    { "fft_radix_4_first_stage_axis_0", "common/fft.cl" },
+    { "fft_radix_4_first_stage_axis_1", "common/fft.cl" },
+    { "fft_radix_4_axis_0", "common/fft.cl" },
+    { "fft_radix_4_axis_1", "common/fft.cl" },
+    { "fft_radix_5_first_stage_axis_0", "common/fft.cl" },
+    { "fft_radix_5_first_stage_axis_1", "common/fft.cl" },
+    { "fft_radix_5_axis_0", "common/fft.cl" },
+    { "fft_radix_5_axis_1", "common/fft.cl" },
+    { "fft_radix_7_first_stage_axis_0", "common/fft.cl" },
+    { "fft_radix_7_first_stage_axis_1", "common/fft.cl" },
+    { "fft_radix_7_axis_0", "common/fft.cl" },
+    { "fft_radix_7_axis_1", "common/fft.cl" },
+    { "fft_radix_8_first_stage_axis_0", "common/fft.cl" },
+    { "fft_radix_8_first_stage_axis_1", "common/fft.cl" },
+    { "fft_radix_8_axis_0", "common/fft.cl" },
+    { "fft_radix_8_axis_1", "common/fft.cl" },
+    { "fft_scale_conj", "common/fft_scale.cl" },
+    { "fill_image_borders_constant", "common/fill_border.cl" },
+    { "fill_image_borders_replicate", "common/fill_border.cl" },
+    { "floor_layer", "common/floor.cl" },
+    { "fuse_batchnormalization_layer", "common/batchnormalization_layer.cl" },
+    { "gather", "common/gather.cl" },
+    { "gemm_ma_f16", "common/gemm.cl" },
+    { "gemm_ma_f32", "common/gemm.cl" },
+    { "gemm_mv", "common/gemv.cl" },
+    { "gemm_mv_quantized", "common/gemv.cl" },
+    { "gemm_mm_interleaved_transposed_f16", "common/gemm_v1.cl" },
+    { "gemm_mm_interleaved_transposed_f16_acc32", "common/gemm_v1.cl" },
+    { "gemm_mm_interleaved_transposed_f16_bifrost", "common/gemm_v1.cl" },
+    { "gemm_mm_interleaved_transposed_f32", "common/gemm_v1.cl" },
+    { "gemm_mm_interleaved_transposed_f32_bifrost", "common/gemm_v1.cl" },
+    { "gemm_mm_floating_point", "common/gemm_v1.cl" },
+    { "gemm_mm_floating_point_f16_bifrost", "common/gemm_v1.cl" },
+    { "gemm_mm_floating_point_f16_bifrost_acc32", "common/gemm_v1.cl" },
+    { "gemm_mm_floating_point_f32_bifrost", "common/gemm_v1.cl" },
+    { "gemm_mm_floating_point_f32_bifrost_1000", "common/gemm_v1.cl" },
+    { "gemm_mm_native", "common/gemm.cl" },
+    { "gemm_mm_reshaped_lhs_nt_rhs_t", "common/gemm.cl" },
+    { "gemm_mm_reshaped_lhs_nt_rhs_t_texture", "common/gemm.cl" },
+    { "gemm_mm_reshaped_lhs_t_rhs_nt", "common/gemm.cl" },
+    { "gemm_mm_reshaped_lhs_t_rhs_nt_texture", "common/gemm.cl" },
+    { "gemm_mm_reshaped_only_rhs_nt", "common/gemm.cl" },
+    { "gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm.cl" },
+    { "gemm_mm_reshaped_only_rhs_t", "common/gemm.cl" },
+    { "gemm_mm_reshaped_only_rhs_t_texture", "common/gemm.cl" },
+    { "gemm_lc_vm_f32", "common/gemm.cl" },
+    { "gemm_reshape_lhs_matrix_nt", "common/gemm.cl" },
+    { "gemm_reshape_lhs_matrix_t", "common/gemm.cl" },
+    { "gemm_reshape_rhs_matrix_nt", "common/gemm.cl" },
+    { "gemm_reshape_rhs_matrix_t", "common/gemm.cl" },
+    { "gemmlowp_matrix_a_reduction", "common/gemmlowp.cl" },
+    { "gemmlowp_matrix_a_reduction_dot8", "common/gemmlowp.cl" },
+    { "gemmlowp_matrix_b_reduction", "common/gemmlowp.cl" },
+    { "gemmlowp_mm_native", "common/gemmlowp.cl" },
+    { "gemmlowp_mm_reshaped_lhs_nt_rhs_t", "common/gemmlowp.cl" },
+    { "gemmlowp_mm_reshaped_only_rhs_t", "common/gemmlowp.cl" },
+    { "gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint", "common/gemmlowp.cl" },
+    { "gemmlowp_offset_contribution", "common/gemmlowp.cl" },
+    { "gemmlowp_offset_contribution_quantize_down", "common/gemmlowp.cl" },
+    { "gemmlowp_offset_contribution_quantize_down_fixedpoint", "common/gemmlowp.cl" },
+    { "gemmlowp_output_stage_quantize_down", "common/gemmlowp.cl" },
+    { "gemmlowp_output_stage_quantize_down_fixedpoint", "common/gemmlowp.cl" },
+    { "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16", "common/gemmlowp.cl" },
+    { "gemmlowp_output_stage_quantize_down_float", "common/gemmlowp.cl" },
+    { "generate_proposals_compute_all_anchors", "common/generate_proposals.cl" },
+    { "generate_proposals_compute_all_anchors_quantized", "common/generate_proposals_quantized.cl" },
+    { "im2col1x1_stridex1_nchw", "nchw/im2col.cl" },
+    { "im2col3x3_nchw", "nchw/im2col.cl" },
+    { "im2col5x5_nchw", "nchw/im2col.cl" },
+    { "im2col11x11_padx0_pady0_nchw", "nchw/im2col.cl" },
+    { "im2col_generic_nchw", "nchw/im2col.cl" },
+    { "im2col_generic_padx0_pady0_nchw", "nchw/im2col.cl" },
+    { "im2col3x3_nhwc", "nhwc/im2col.cl" },
+    { "im2col9x9_nhwc", "nhwc/im2col.cl" },
+    { "im2col_generic_nhwc", "nhwc/im2col.cl" },
+    { "instance_normalization", "common/instance_normalization.cl" },
+    { "compute_mean_var", "common/instance_normalization.cl" },
+    { "l2_normalize_x", "common/l2_normalize.cl" },
+    { "l2_normalize_y", "common/l2_normalize.cl" },
+    { "l2_normalize_z", "common/l2_normalize.cl" },
+    { "max_unpooling_layer_2", "common/unpooling_layer.cl" },
+    { "mean_stddev_normalization", "common/mean_stddev_normalization.cl" },
+    { "memset", "common/memset.cl" },
+    { "minmax_layer", "common/minmax_layer.cl" },
+    { "non_max_suppression", "common/nonmax.cl" },
+    { "normalization_layer_cross_map_nchw", "nchw/normalization_layer.cl" },
+    { "normalization_layer_cross_map_nhwc", "nhwc/normalization_layer.cl" },
+    { "normalization_layer_in_map_nchw", "nchw/normalization_layer.cl" },
+    { "normalization_layer_in_map_nhwc", "nhwc/normalization_layer.cl" },
+    { "normalize_planar_yuv_layer_nchw", "nchw/normalize_planar_yuv_layer.cl" },
+    { "normalize_planar_yuv_layer_nhwc", "nhwc/normalize_planar_yuv_layer.cl" },
+    { "normalize_planar_yuv_layer_q8_nchw", "nchw/normalize_planar_yuv_layer_quantized.cl" },
+    { "normalize_planar_yuv_layer_q8_nhwc", "nhwc/normalize_planar_yuv_layer_quantized.cl" },
+    { "pad_layer_constant", "common/pad_layer.cl" },
+    { "pad_layer_symmetric_reflect", "common/pad_layer.cl" },
+    { "permute", "common/permute.cl" },
+    { "pixelwise_mul_complex", "common/pixelwise_mul_float.cl" },
+    { "pixelwise_mul_float", "common/pixelwise_mul_float.cl" },
+    { "pixelwise_mul_int", "common/pixelwise_mul_int.cl" },
+    { "pixelwise_mul_quantized", "common/pixelwise_mul_int.cl" },
+    { "pooling_layer_2", "common/pooling_layer.cl" },
+    { "pooling_layer_3", "common/pooling_layer.cl" },
+    { "pooling_layer_optimized_3", "common/pooling_layer.cl" },
+    { "pooling_layer_7", "common/pooling_layer.cl" },
+    { "pooling_layer_MxN_nchw", "nchw/pooling_layer.cl" },
+    { "pooling_layer_MxN_nhwc", "nhwc/pooling_layer.cl" },
+    { "pooling_layer_2x2_nhwc", "nhwc/pooling_layer.cl" },
+    { "pooling_layer_2_nchw_indices_fp32", "nchw/pooling_layer.cl" },
+    { "pooling_layer_2_nchw_indices_fp16", "nchw/pooling_layer.cl" },
+    { "pooling_layer_MxN_quantized_nhwc", "nhwc/pooling_layer_quantized.cl" },
+    { "pooling_layer_MxN_quantized_nchw", "nchw/pooling_layer_quantized.cl" },
+    { "prior_box_layer_nchw", "nchw/prior_box_layer.cl" },
+    { "qlstm_layer_normalization", "common/qlstm_layer_normalization.cl" },
+    { "quantization_layer", "common/quantization_layer.cl" },
+    { "range", "common/range.cl" },
+    { "range_quantized", "common/range.cl" },
+    { "reduction_operation_x", "common/reduction_operation.cl" },
+    { "reduction_operation_non_parallel_x", "common/reduction_operation.cl" },
+    { "reduction_operation_y", "common/reduction_operation.cl" },
+    { "reduction_operation_z", "common/reduction_operation.cl" },
+    { "reduction_operation_w", "common/reduction_operation.cl" },
+    { "remap_nearest_neighbour_nchw", "nchw/remap.cl" },
+    { "remap_bilinear_nchw", "nchw/remap.cl" },
+    { "remap_nearest_neighbour_nhwc", "nhwc/remap.cl" },
+    { "remap_bilinear_nhwc", "nhwc/remap.cl" },
+    { "reorg_layer_nchw", "nchw/reorg_layer.cl" },
+    { "reorg_layer_nhwc", "nhwc/reorg_layer.cl" },
+    { "reshape_layer", "common/reshape_layer.cl" },
+    { "reshape_to_columns", "common/convolution_layer.cl" },
+    { "reverse", "common/reverse.cl" },
+    { "roi_align_layer", "common/roi_align_layer.cl" },
+    { "roi_align_layer_quantized", "common/roi_align_layer_quantized.cl" },
+    { "roi_pooling_layer", "common/roi_pooling_layer.cl" },
+    { "scale_nearest_neighbour_nchw", "nchw/scale.cl" },
+    { "scale_nearest_neighbour_nhwc", "nhwc/scale.cl" },
+    { "scale_bilinear_nchw", "nchw/scale.cl" },
+    { "scale_bilinear_nhwc", "nhwc/scale.cl" },
+    { "scale_bilinear_quantized_nchw", "nchw/scale_quantized.cl" },
+    { "scale_bilinear_quantized_nhwc", "nhwc/scale_quantized.cl" },
+    { "select_same_rank", "common/select.cl" },
+    { "select_different_rank_2", "common/select.cl" },
+    { "select_different_rank_n", "common/select.cl" },
+    { "softmax_layer_norm", "common/softmax_layer.cl" },
+    { "softmax_layer_norm_quantized", "common/softmax_layer_quantized.cl" },
+    { "softmax_layer_max_shift_exp_sum_quantized_serial", "common/softmax_layer_quantized.cl" },
+    { "softmax_layer_max_shift_exp_sum_quantized_parallel", "common/softmax_layer_quantized.cl" },
+    { "softmax_layer_max_shift_exp_sum_serial", "common/softmax_layer.cl" },
+    { "space_to_batch_nchw", "nchw/space_to_batch.cl" },
+    { "space_to_batch_static_nchw", "nchw/space_to_batch.cl" },
+    { "space_to_batch_nhwc", "nhwc/space_to_batch.cl" },
+    { "space_to_batch_static_nhwc", "nhwc/space_to_batch.cl" },
+    { "space_to_depth_nchw", "nchw/space_to_depth.cl" },
+    { "space_to_depth_nhwc", "nhwc/space_to_depth.cl" },
+    { "softmax_layer_max_shift_exp_sum_parallel", "common/softmax_layer.cl" },
+    { "stack_layer", "common/stack_layer.cl" },
+    { "strided_slice", "common/slice_ops.cl" },
+    { "tile", "common/tile.cl" },
+    { "transpose", "common/transpose.cl" },
+    { "upsample_layer_nchw", "nchw/upsample_layer.cl" },
+    { "upsample_layer_nhwc", "nhwc/upsample_layer.cl" },
+    { "winograd_filter_transform_2x2_3x3_nchw", "nchw/winograd_filter_transform.cl" },
+    { "winograd_filter_transform_2x1_3x1_nchw", "nchw/winograd_filter_transform.cl" },
+    { "winograd_filter_transform_1x2_1x3_nchw", "nchw/winograd_filter_transform.cl" },
+    { "winograd_filter_transform_4x4_3x3_nchw", "nchw/winograd_filter_transform.cl" },
+    { "winograd_filter_transform_4x1_3x1_nchw", "nchw/winograd_filter_transform.cl" },
+    { "winograd_filter_transform_1x4_1x3_nchw", "nchw/winograd_filter_transform.cl" },
+    { "winograd_filter_transform_4x4_5x5_nchw", "nchw/winograd_filter_transform.cl" },
+    { "winograd_filter_transform_4x1_5x1_nchw", "nchw/winograd_filter_transform.cl" },
+    { "winograd_filter_transform_1x4_1x5_nchw", "nchw/winograd_filter_transform.cl" },
+    { "winograd_filter_transform_4x1_3x1_nhwc", "nhwc/winograd_filter_transform.cl" },
+    { "winograd_filter_transform_1x4_1x3_nhwc", "nhwc/winograd_filter_transform.cl" },
+    { "winograd_filter_transform_4x4_3x3_nhwc", "nhwc/winograd_filter_transform.cl" },
+    { "winograd_filter_transform_4x4_5x5_nhwc", "nhwc/winograd_filter_transform.cl" },
+    { "winograd_filter_transform_4x1_5x1_nhwc", "nhwc/winograd_filter_transform.cl" },
+    { "winograd_filter_transform_1x4_1x5_nhwc", "nhwc/winograd_filter_transform.cl" },
+    { "winograd_filter_transform_2x2_7x7_nhwc", "nhwc/winograd_filter_transform.cl" },
+    { "winograd_filter_transform_2x1_7x1_nhwc", "nhwc/winograd_filter_transform.cl" },
+    { "winograd_filter_transform_1x2_1x7_nhwc", "nhwc/winograd_filter_transform.cl" },
+    { "winograd_input_transform_2x2_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl" },
+    { "winograd_input_transform_2x2_3x3_stepz2_nchw", "nchw/winograd_input_transform.cl" },
+    { "winograd_input_transform_2x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl" },
+    { "winograd_input_transform_2x1_3x1_stepz2_nchw", "nchw/winograd_input_transform.cl" },
+    { "winograd_input_transform_1x2_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl" },
+    { "winograd_input_transform_1x2_1x3_stepz2_nchw", "nchw/winograd_input_transform.cl" },
+    { "winograd_input_transform_4x4_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl" },
+    { "winograd_input_transform_4x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl" },
+    { "winograd_input_transform_1x4_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl" },
+    { "winograd_input_transform_4x4_5x5_stepz1_nchw", "nchw/winograd_input_transform.cl" },
+    { "winograd_input_transform_4x1_5x1_stepz1_nchw", "nchw/winograd_input_transform.cl" },
+    { "winograd_input_transform_1x4_1x5_stepz1_nchw", "nchw/winograd_input_transform.cl" },
+    { "winograd_input_transform_4x1_3x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
+    { "winograd_input_transform_1x4_1x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
+    { "winograd_input_transform_4x4_3x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
+    { "winograd_input_transform_4x4_5x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
+    { "winograd_input_transform_4x1_5x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
+    { "winograd_input_transform_1x4_1x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
+    { "winograd_input_transform_2x2_7x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
+    { "winograd_input_transform_2x1_7x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
+    { "winograd_input_transform_1x2_1x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
+    { "winograd_output_transform_2x2_3x3_nchw", "nchw/winograd_output_transform.cl" },
+    { "winograd_output_transform_2x1_3x1_nchw", "nchw/winograd_output_transform.cl" },
+    { "winograd_output_transform_1x2_1x3_nchw", "nchw/winograd_output_transform.cl" },
+    { "winograd_output_transform_4x4_3x3_nchw", "nchw/winograd_output_transform.cl" },
+    { "winograd_output_transform_4x1_3x1_nchw", "nchw/winograd_output_transform.cl" },
+    { "winograd_output_transform_1x4_1x3_nchw", "nchw/winograd_output_transform.cl" },
+    { "winograd_output_transform_4x4_5x5_nchw", "nchw/winograd_output_transform.cl" },
+    { "winograd_output_transform_4x1_5x1_nchw", "nchw/winograd_output_transform.cl" },
+    { "winograd_output_transform_1x4_1x5_nchw", "nchw/winograd_output_transform.cl" },
+    { "winograd_output_transform_4x1_3x1_nhwc", "nhwc/winograd_output_transform.cl" },
+    { "winograd_output_transform_1x4_1x3_nhwc", "nhwc/winograd_output_transform.cl" },
+    { "winograd_output_transform_4x4_3x3_nhwc", "nhwc/winograd_output_transform.cl" },
+    { "winograd_output_transform_4x4_5x5_nhwc", "nhwc/winograd_output_transform.cl" },
+    { "winograd_output_transform_4x1_5x1_nhwc", "nhwc/winograd_output_transform.cl" },
+    { "winograd_output_transform_1x4_1x5_nhwc", "nhwc/winograd_output_transform.cl" },
+    { "winograd_output_transform_2x2_7x7_nhwc", "nhwc/winograd_output_transform.cl" },
+    { "winograd_output_transform_2x1_7x1_nhwc", "nhwc/winograd_output_transform.cl" },
+    { "winograd_output_transform_1x2_1x7_nhwc", "nhwc/winograd_output_transform.cl" },
 };
 
 const std::map<std::string, std::string> ClKernelLibrary::_program_source_map =
 {
 #ifdef EMBEDDED_KERNELS
     {
-        "activation_layer.cl",
-#include "./cl_kernels/activation_layer.clembed"
+        "common/activation_layer.cl",
+#include "./cl_kernels/common/activation_layer.clembed"
     },
     {
-        "activation_layer_quant.cl",
-#include "./cl_kernels/activation_layer_quant.clembed"
+        "common/activation_layer_quant.cl",
+#include "./cl_kernels/common/activation_layer_quant.clembed"
     },
     {
-        "arg_min_max.cl",
-#include "./cl_kernels/arg_min_max.clembed"
+        "common/arg_min_max.cl",
+#include "./cl_kernels/common/arg_min_max.clembed"
     },
     {
-        "batch_to_space.cl",
-#include "./cl_kernels/batch_to_space.clembed"
+        "nchw/batch_to_space.cl",
+#include "./cl_kernels/nchw/batch_to_space.clembed"
     },
     {
-        "bitwise_op.cl",
-#include "./cl_kernels/bitwise_op.clembed"
+        "nhwc/batch_to_space.cl",
+#include "./cl_kernels/nhwc/batch_to_space.clembed"
     },
     {
-        "bounding_box_transform.cl",
-#include "./cl_kernels/bounding_box_transform.clembed"
+        "common/bitwise_op.cl",
+#include "./cl_kernels/common/bitwise_op.clembed"
     },
     {
-        "bounding_box_transform_quantized.cl",
-#include "./cl_kernels/bounding_box_transform_quantized.clembed"
+        "common/bounding_box_transform.cl",
+#include "./cl_kernels/common/bounding_box_transform.clembed"
     },
     {
-        "channel_shuffle.cl",
-#include "./cl_kernels/channel_shuffle.clembed"
+        "common/bounding_box_transform_quantized.cl",
+#include "./cl_kernels/common/bounding_box_transform_quantized.clembed"
     },
     {
-        "col2im.cl",
-#include "./cl_kernels/col2im.clembed"
+        "nchw/channel_shuffle.cl",
+#include "./cl_kernels/nchw/channel_shuffle.clembed"
     },
     {
-        "comparisons.cl",
-#include "./cl_kernels/comparisons.clembed"
+        "nhwc/channel_shuffle.cl",
+#include "./cl_kernels/nhwc/channel_shuffle.clembed"
     },
     {
-        "concatenate.cl",
-#include "./cl_kernels/concatenate.clembed"
+        "common/col2im.cl",
+#include "./cl_kernels/common/col2im.clembed"
     },
     {
-        "convert_fc_weights.cl",
-#include "./cl_kernels/convert_fc_weights.clembed"
+        "common/comparisons.cl",
+#include "./cl_kernels/common/comparisons.clembed"
     },
     {
-        "convolution_layer.cl",
-#include "./cl_kernels/convolution_layer.clembed"
+        "common/concatenate.cl",
+#include "./cl_kernels/common/concatenate.clembed"
     },
     {
-        "copy_tensor.cl",
-#include "./cl_kernels/copy_tensor.clembed"
+        "common/convert_fc_weights.cl",
+#include "./cl_kernels/common/convert_fc_weights.clembed"
     },
     {
-        "crop_tensor.cl",
-#include "./cl_kernels/crop_tensor.clembed"
+        "common/convolution_layer.cl",
+#include "./cl_kernels/common/convolution_layer.clembed"
     },
     {
-        "upsample_layer.cl",
-#include "./cl_kernels/upsample_layer.clembed"
+        "common/copy_tensor.cl",
+#include "./cl_kernels/common/copy_tensor.clembed"
     },
     {
-        "deconvolution_layer.cl",
-#include "./cl_kernels/deconvolution_layer.clembed"
+        "common/crop_tensor.cl",
+#include "./cl_kernels/common/crop_tensor.clembed"
     },
     {
-        "cast.cl",
-#include "./cl_kernels/cast.clembed"
+        "nchw/upsample_layer.cl",
+#include "./cl_kernels/nchw/upsample_layer.clembed"
     },
     {
-        "depth_to_space.cl",
-#include "./cl_kernels/depth_to_space.clembed"
+        "nhwc/upsample_layer.cl",
+#include "./cl_kernels/nhwc/upsample_layer.clembed"
     },
     {
-        "dequantization_layer.cl",
-#include "./cl_kernels/dequantization_layer.clembed"
+        "common/deconvolution_layer.cl",
+#include "./cl_kernels/common/deconvolution_layer.clembed"
     },
     {
-        "direct_convolution1x1.cl",
-#include "./cl_kernels/direct_convolution1x1.clembed"
+        "common/cast.cl",
+#include "./cl_kernels/common/cast.clembed"
     },
     {
-        "direct_convolution3x3.cl",
-#include "./cl_kernels/direct_convolution3x3.clembed"
+        "nchw/depth_to_space.cl",
+#include "./cl_kernels/nchw/depth_to_space.clembed"
     },
     {
-        "direct_convolution5x5.cl",
-#include "./cl_kernels/direct_convolution5x5.clembed"
+        "nhwc/depth_to_space.cl",
+#include "./cl_kernels/nhwc/depth_to_space.clembed"
     },
     {
-        "direct_convolution_quantized.cl",
-#include "./cl_kernels/direct_convolution_quantized.clembed"
+        "common/dequantization_layer.cl",
+#include "./cl_kernels/common/dequantization_layer.clembed"
     },
     {
-        "direct_convolution.cl",
-#include "./cl_kernels/direct_convolution.clembed"
+        "nchw/dequantization_layer.cl",
+#include "./cl_kernels/nchw/dequantization_layer.clembed"
     },
     {
-        "dwc_native_fp_nhwc.cl",
-#include "./cl_kernels/dwc_native_fp_nhwc.clembed"
+        "nhwc/dequantization_layer.cl",
+#include "./cl_kernels/nhwc/dequantization_layer.clembed"
     },
     {
-        "dwc_native_quantized_nhwc.cl",
-#include "./cl_kernels/dwc_native_quantized_nhwc.clembed"
+        "nchw/direct_convolution1x1.cl",
+#include "./cl_kernels/nchw/direct_convolution1x1.clembed"
     },
     {
-        "elementwise_operation.cl",
-#include "./cl_kernels/elementwise_operation.clembed"
+        "nchw/direct_convolution3x3.cl",
+#include "./cl_kernels/nchw/direct_convolution3x3.clembed"
     },
     {
-        "elementwise_operation_quantized.cl",
-#include "./cl_kernels/elementwise_operation_quantized.clembed"
+        "nchw/direct_convolution5x5.cl",
+#include "./cl_kernels/nchw/direct_convolution5x5.clembed"
     },
     {
-        "elementwise_unary.cl",
-#include "./cl_kernels/elementwise_unary.clembed"
+        "nchw/direct_convolution_quantized.cl",
+#include "./cl_kernels/nchw/direct_convolution_quantized.clembed"
     },
     {
-        "fft.cl",
-#include "./cl_kernels/fft.clembed"
+        "nhwc/direct_convolution.cl",
+#include "./cl_kernels/nhwc/direct_convolution.clembed"
     },
     {
-        "fft_digit_reverse.cl",
-#include "./cl_kernels/fft_digit_reverse.clembed"
+        "nhwc/dwc_native_fp_nhwc.cl",
+#include "./cl_kernels/nhwc/dwc_native_fp_nhwc.clembed"
     },
     {
-        "fft_scale.cl",
-#include "./cl_kernels/fft_scale.clembed"
+        "nhwc/dwc_native_quantized_nhwc.cl",
+#include "./cl_kernels/nhwc/dwc_native_quantized_nhwc.clembed"
     },
     {
-        "fill_border.cl",
-#include "./cl_kernels/fill_border.clembed"
+        "common/elementwise_operation.cl",
+#include "./cl_kernels/common/elementwise_operation.clembed"
     },
     {
-        "floor.cl",
-#include "./cl_kernels/floor.clembed"
+        "common/elementwise_operation_quantized.cl",
+#include "./cl_kernels/common/elementwise_operation_quantized.clembed"
     },
     {
-        "gather.cl",
-#include "./cl_kernels/gather.clembed"
+        "common/elementwise_unary.cl",
+#include "./cl_kernels/common/elementwise_unary.clembed"
     },
     {
-        "gemm.cl",
-#include "./cl_kernels/gemm.clembed"
+        "common/fft.cl",
+#include "./cl_kernels/common/fft.clembed"
     },
     {
-        "gemm_v1.cl",
-#include "./cl_kernels/gemm_v1.clembed"
+        "common/fft_digit_reverse.cl",
+#include "./cl_kernels/common/fft_digit_reverse.clembed"
     },
     {
-        "gemmlowp.cl",
-#include "./cl_kernels/gemmlowp.clembed"
+        "common/fft_scale.cl",
+#include "./cl_kernels/common/fft_scale.clembed"
     },
     {
-        "gemv.cl",
-#include "./cl_kernels/gemv.clembed"
+        "common/fill_border.cl",
+#include "./cl_kernels/common/fill_border.clembed"
     },
     {
-        "generate_proposals.cl",
-#include "./cl_kernels/generate_proposals.clembed"
+        "common/floor.cl",
+#include "./cl_kernels/common/floor.clembed"
     },
     {
-        "generate_proposals_quantized.cl",
-#include "./cl_kernels/generate_proposals_quantized.clembed"
+        "common/gather.cl",
+#include "./cl_kernels/common/gather.clembed"
+    },
+    {
+        "common/gemm.cl",
+#include "./cl_kernels/common/gemm.clembed"
+    },
+    {
+        "common/gemm_v1.cl",
+#include "./cl_kernels/common/gemm_v1.clembed"
+    },
+    {
+        "common/gemmlowp.cl",
+#include "./cl_kernels/common/gemmlowp.clembed"
+    },
+    {
+        "common/gemv.cl",
+#include "./cl_kernels/common/gemv.clembed"
+    },
+    {
+        "common/generate_proposals.cl",
+#include "./cl_kernels/common/generate_proposals.clembed"
+    },
+    {
+        "common/generate_proposals_quantized.cl",
+#include "./cl_kernels/common/generate_proposals_quantized.clembed"
     },
     {
         "helpers.h",
@@ -665,184 +689,256 @@ const std::map<std::string, std::string> ClKernelLibrary::_program_source_map =
 #include "./cl_kernels/helpers_asymm.hembed"
     },
     {
-        "im2col.cl",
-#include "./cl_kernels/im2col.clembed"
+        "nchw/im2col.cl",
+#include "./cl_kernels/nchw/im2col.clembed"
+    },
+    {
+        "nhwc/im2col.cl",
+#include "./cl_kernels/nhwc/im2col.clembed"
+    },
+    {
+        "common/instance_normalization.cl",
+#include "./cl_kernels/common/instance_normalization.clembed"
+    },
+    {
+        "common/l2_normalize.cl",
+#include "./cl_kernels/common/l2_normalize.clembed"
+    },
+    {
+        "common/mean_stddev_normalization.cl",
+#include "./cl_kernels/common/mean_stddev_normalization.clembed"
+    },
+    {
+        "common/memset.cl",
+#include "./cl_kernels/common/memset.clembed"
+    },
+    {
+        "common/minmax_layer.cl",
+#include "./cl_kernels/common/minmax_layer.clembed"
     },
     {
-        "instance_normalization.cl",
-#include "./cl_kernels/instance_normalization.clembed"
+        "common/nonmax.cl",
+#include "./cl_kernels/common/nonmax.clembed"
     },
     {
-        "l2_normalize.cl",
-#include "./cl_kernels/l2_normalize.clembed"
+        "nchw/normalization_layer.cl",
+#include "./cl_kernels/nchw/normalization_layer.clembed"
     },
     {
-        "mean_stddev_normalization.cl",
-#include "./cl_kernels/mean_stddev_normalization.clembed"
+        "nhwc/normalization_layer.cl",
+#include "./cl_kernels/nhwc/normalization_layer.clembed"
     },
     {
-        "memset.cl",
-#include "./cl_kernels/memset.clembed"
+        "nchw/normalize_planar_yuv_layer.cl",
+#include "./cl_kernels/nchw/normalize_planar_yuv_layer.clembed"
     },
     {
-        "minmax_layer.cl",
-#include "./cl_kernels/minmax_layer.clembed"
+        "nhwc/normalize_planar_yuv_layer.cl",
+#include "./cl_kernels/nhwc/normalize_planar_yuv_layer.clembed"
     },
     {
-        "nonmax.cl",
-#include "./cl_kernels/nonmax.clembed"
+        "nchw/normalize_planar_yuv_layer_quantized.cl",
+#include "./cl_kernels/nchw/normalize_planar_yuv_layer_quantized.clembed"
     },
     {
-        "normalization_layer.cl",
-#include "./cl_kernels/normalization_layer.clembed"
+        "nhwc/normalize_planar_yuv_layer_quantized.cl",
+#include "./cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.clembed"
     },
     {
-        "normalize_planar_yuv_layer.cl",
-#include "./cl_kernels/normalize_planar_yuv_layer.clembed"
+        "common/batchnormalization_layer.cl",
+#include "./cl_kernels/common/batchnormalization_layer.clembed"
     },
     {
-        "normalize_planar_yuv_layer_quantized.cl",
-#include "./cl_kernels/normalize_planar_yuv_layer_quantized.clembed"
+        "nchw/batchnormalization_layer.cl",
+#include "./cl_kernels/nchw/batchnormalization_layer.clembed"
     },
     {
-        "batchnormalization_layer.cl",
-#include "./cl_kernels/batchnormalization_layer.clembed"
+        "nhwc/batchnormalization_layer.cl",
+#include "./cl_kernels/nhwc/batchnormalization_layer.clembed"
     },
     {
-        "pad_layer.cl",
-#include "./cl_kernels/pad_layer.clembed"
+        "common/pad_layer.cl",
+#include "./cl_kernels/common/pad_layer.clembed"
     },
     {
-        "permute.cl",
-#include "./cl_kernels/permute.clembed"
+        "common/permute.cl",
+#include "./cl_kernels/common/permute.clembed"
     },
     {
-        "pixelwise_mul_float.cl",
-#include "./cl_kernels/pixelwise_mul_float.clembed"
+        "common/pixelwise_mul_float.cl",
+#include "./cl_kernels/common/pixelwise_mul_float.clembed"
     },
     {
-        "pixelwise_mul_int.cl",
-#include "./cl_kernels/pixelwise_mul_int.clembed"
+        "common/pixelwise_mul_int.cl",
+#include "./cl_kernels/common/pixelwise_mul_int.clembed"
     },
     {
-        "pooling_layer.cl",
-#include "./cl_kernels/pooling_layer.clembed"
+        "common/pooling_layer.cl",
+#include "./cl_kernels/common/pooling_layer.clembed"
     },
     {
-        "pooling_layer_quantized.cl",
-#include "./cl_kernels/pooling_layer_quantized.clembed"
+        "nchw/pooling_layer.cl",
+#include "./cl_kernels/nchw/pooling_layer.clembed"
     },
     {
-        "prior_box_layer.cl",
-#include "./cl_kernels/prior_box_layer.clembed"
+        "nhwc/pooling_layer.cl",
+#include "./cl_kernels/nhwc/pooling_layer.clembed"
     },
     {
-        "qlstm_layer_normalization.cl",
-#include "./cl_kernels/qlstm_layer_normalization.clembed"
+        "nchw/pooling_layer_quantized.cl",
+#include "./cl_kernels/nchw/pooling_layer_quantized.clembed"
     },
     {
-        "quantization_layer.cl",
-#include "./cl_kernels/quantization_layer.clembed"
+        "nhwc/pooling_layer_quantized.cl",
+#include "./cl_kernels/nhwc/pooling_layer_quantized.clembed"
     },
     {
-        "range.cl",
-#include "./cl_kernels/range.clembed"
+        "nchw/prior_box_layer.cl",
+#include "./cl_kernels/nchw/prior_box_layer.clembed"
     },
     {
-        "reduction_operation.cl",
-#include "./cl_kernels/reduction_operation.clembed"
+        "common/qlstm_layer_normalization.cl",
+#include "./cl_kernels/common/qlstm_layer_normalization.clembed"
     },
     {
-        "remap.cl",
-#include "./cl_kernels/remap.clembed"
+        "common/quantization_layer.cl",
+#include "./cl_kernels/common/quantization_layer.clembed"
     },
     {
-        "reorg_layer.cl",
-#include "./cl_kernels/reorg_layer.clembed"
+        "common/range.cl",
+#include "./cl_kernels/common/range.clembed"
     },
     {
-        "reshape_layer.cl",
-#include "./cl_kernels/reshape_layer.clembed"
+        "common/reduction_operation.cl",
+#include "./cl_kernels/common/reduction_operation.clembed"
     },
     {
-        "reverse.cl",
-#include "./cl_kernels/reverse.clembed"
+        "nchw/remap.cl",
+#include "./cl_kernels/nchw/remap.clembed"
     },
     {
-        "roi_align_layer.cl",
-#include "./cl_kernels/roi_align_layer.clembed"
+        "nhwc/remap.cl",
+#include "./cl_kernels/nhwc/remap.clembed"
     },
     {
-        "roi_align_layer_quantized.cl",
-#include "./cl_kernels/roi_align_layer_quantized.clembed"
+        "nchw/reorg_layer.cl",
+#include "./cl_kernels/nchw/reorg_layer.clembed"
     },
     {
-        "roi_pooling_layer.cl",
-#include "./cl_kernels/roi_pooling_layer.clembed"
+        "nhwc/reorg_layer.cl",
+#include "./cl_kernels/nhwc/reorg_layer.clembed"
     },
     {
-        "scale.cl",
-#include "./cl_kernels/scale.clembed"
+        "common/reshape_layer.cl",
+#include "./cl_kernels/common/reshape_layer.clembed"
     },
     {
-        "scale_quantized.cl",
-#include "./cl_kernels/scale_quantized.clembed"
+        "common/reverse.cl",
+#include "./cl_kernels/common/reverse.clembed"
     },
     {
-        "select.cl",
-#include "./cl_kernels/select.clembed"
+        "common/roi_align_layer.cl",
+#include "./cl_kernels/common/roi_align_layer.clembed"
     },
     {
-        "softmax_layer.cl",
-#include "./cl_kernels/softmax_layer.clembed"
+        "common/roi_align_layer_quantized.cl",
+#include "./cl_kernels/common/roi_align_layer_quantized.clembed"
     },
     {
-        "softmax_layer_quantized.cl",
-#include "./cl_kernels/softmax_layer_quantized.clembed"
+        "common/roi_pooling_layer.cl",
+#include "./cl_kernels/common/roi_pooling_layer.clembed"
     },
     {
-        "slice_ops.cl",
-#include "./cl_kernels/slice_ops.clembed"
+        "nchw/scale.cl",
+#include "./cl_kernels/nchw/scale.clembed"
     },
     {
-        "space_to_batch.cl",
-#include "./cl_kernels/space_to_batch.clembed"
+        "nhwc/scale.cl",
+#include "./cl_kernels/nhwc/scale.clembed"
     },
     {
-        "space_to_depth.cl",
-#include "./cl_kernels/space_to_depth.clembed"
+        "nchw/scale_quantized.cl",
+#include "./cl_kernels/nchw/scale_quantized.clembed"
     },
     {
-        "stack_layer.cl",
-#include "./cl_kernels/stack_layer.clembed"
+        "nhwc/scale_quantized.cl",
+#include "./cl_kernels/nhwc/scale_quantized.clembed"
     },
     {
-        "tile.cl",
-#include "./cl_kernels/tile.clembed"
+        "common/select.cl",
+#include "./cl_kernels/common/select.clembed"
     },
     {
-        "transpose.cl",
-#include "./cl_kernels/transpose.clembed"
+        "common/softmax_layer.cl",
+#include "./cl_kernels/common/softmax_layer.clembed"
+    },
+    {
+        "common/softmax_layer_quantized.cl",
+#include "./cl_kernels/common/softmax_layer_quantized.clembed"
+    },
+    {
+        "common/slice_ops.cl",
+#include "./cl_kernels/common/slice_ops.clembed"
+    },
+    {
+        "nchw/space_to_batch.cl",
+#include "./cl_kernels/nchw/space_to_batch.clembed"
+    },
+    {
+        "nhwc/space_to_batch.cl",
+#include "./cl_kernels/nhwc/space_to_batch.clembed"
+    },
+    {
+        "nchw/space_to_depth.cl",
+#include "./cl_kernels/nchw/space_to_depth.clembed"
+    },
+    {
+        "nhwc/space_to_depth.cl",
+#include "./cl_kernels/nhwc/space_to_depth.clembed"
+    },
+    {
+        "common/stack_layer.cl",
+#include "./cl_kernels/common/stack_layer.clembed"
+    },
+    {
+        "common/tile.cl",
+#include "./cl_kernels/common/tile.clembed"
+    },
+    {
+        "common/transpose.cl",
+#include "./cl_kernels/common/transpose.clembed"
     },
     {
         "types.h",
 #include "./cl_kernels/types.hembed"
     },
     {
-        "unpooling_layer.cl",
-#include "./cl_kernels/unpooling_layer.clembed"
+        "common/unpooling_layer.cl",
+#include "./cl_kernels/common/unpooling_layer.clembed"
+    },
+    {
+        "nchw/winograd_filter_transform.cl",
+#include "./cl_kernels/nchw/winograd_filter_transform.clembed"
+    },
+    {
+        "nhwc/winograd_filter_transform.cl",
+#include "./cl_kernels/nhwc/winograd_filter_transform.clembed"
+    },
+    {
+        "nchw/winograd_input_transform.cl",
+#include "./cl_kernels/nchw/winograd_input_transform.clembed"
     },
     {
-        "winograd_filter_transform.cl",
-#include "./cl_kernels/winograd_filter_transform.clembed"
+        "nhwc/winograd_input_transform.cl",
+#include "./cl_kernels/nhwc/winograd_input_transform.clembed"
     },
     {
-        "winograd_input_transform.cl",
-#include "./cl_kernels/winograd_input_transform.clembed"
+        "nchw/winograd_output_transform.cl",
+#include "./cl_kernels/nchw/winograd_output_transform.clembed"
     },
     {
-        "winograd_output_transform.cl",
-#include "./cl_kernels/winograd_output_transform.clembed"
+        "nhwc/winograd_output_transform.cl",
+#include "./cl_kernels/nhwc/winograd_output_transform.clembed"
     },
 #endif /* EMBEDDED_KERNELS */
 };
author	Adnan AlSinan <adnan.alsinan@arm.com>	2021-07-05 13:12:52 +0100
committer	Georgios Pinitas <georgios.pinitas@arm.com>	2021-07-25 13:04:23 +0000
commit	7075fe2c5ee6f7cfe7cfd9454d905235e70b9ac4 (patch)
tree	b65671bdf37eb1ef8cc30ef64ab572da795546fa
parent	22f5ed51f1b01f7cf6993a556a0b763e437926fc (diff)
download	ComputeLibrary-7075fe2c5ee6f7cfe7cfd9454d905235e70b9ac4.tar.gz