From 7075fe2c5ee6f7cfe7cfd9454d905235e70b9ac4 Mon Sep 17 00:00:00 2001
From: Adnan AlSinan
Date: Mon, 5 Jul 2021 13:12:52 +0100
Subject: Reorganize the kernels into nhwc, nchw and common folders

The following kernels have been split into nchw/nhwc kernel files:

- batchnormalization_layer
- batch_to_space
- channel_shuffle
- depth_to_space
- dequantization_layer
- im2col
- normalization_layer
- normalize_planar_yuv_layer
- normalize_planar_yuv_layer_quantized
- pooling_layer
- pooling_layer_quantized
- remap
- reorg_layer
- scale
- scale_quantized
- space_to_batch
- space_to_depth
- upsample_layer
- winograd_filter_transform
- winograd_input_transform
- winograd_output_transform

The following kernels have been moved to the nchw folder:

- direct_convolution1x1
- direct_convolution3x3
- direct_convolution5x5
- direct_convolution_quantized
- prior_box_layer

The following kernels have been moved to the nhwc folder:

- direct_convolution
- dwc_native_fp_nhwc
- dwc_native_quantized_nhwc

The following kernels have been removed:

- sobel_filter

The remaining kernels have been moved to the common folder.

Partially resolves COMPMID-4453

Signed-off-by: Adnan AlSinan
Change-Id: Ic327ac935687ec351c610c65a3c6357f364a5a58
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5919
Tested-by: Arm Jenkins
Reviewed-by: Georgios Pinitas
Comments-Addressed: Arm Jenkins
---
 src/core/CL/cl_kernels/activation_layer.cl | 85 -
 src/core/CL/cl_kernels/activation_layer_quant.cl | 162 -
 src/core/CL/cl_kernels/arg_min_max.cl | 451 --
 src/core/CL/cl_kernels/batch_to_space.cl | 232 --
 src/core/CL/cl_kernels/batchnormalization_layer.cl | 418 --
 src/core/CL/cl_kernels/bitwise_op.cl | 218 -
 src/core/CL/cl_kernels/bounding_box_transform.cl | 123 -
 .../cl_kernels/bounding_box_transform_quantized.cl | 110 -
 src/core/CL/cl_kernels/cast.cl | 134 -
 src/core/CL/cl_kernels/channel_shuffle.cl | 222 -
 src/core/CL/cl_kernels/col2im.cl | 111 -
 src/core/CL/cl_kernels/common/activation_layer.cl | 85 +
 .../CL/cl_kernels/common/activation_layer_quant.cl | 162 +
 src/core/CL/cl_kernels/common/arg_min_max.cl | 451 ++
 .../cl_kernels/common/batchnormalization_layer.cl | 183 +
 src/core/CL/cl_kernels/common/bitwise_op.cl | 218 +
 .../CL/cl_kernels/common/bounding_box_transform.cl | 123 +
 .../common/bounding_box_transform_quantized.cl | 110 +
 src/core/CL/cl_kernels/common/cast.cl | 134 +
 src/core/CL/cl_kernels/common/col2im.cl | 111 +
 src/core/CL/cl_kernels/common/comparisons.cl | 150 +
 src/core/CL/cl_kernels/common/concatenate.cl | 415 ++
 .../CL/cl_kernels/common/convert_fc_weights.cl | 58 +
 src/core/CL/cl_kernels/common/convolution_layer.cl | 112 +
 src/core/CL/cl_kernels/common/copy_tensor.cl | 72 +
 src/core/CL/cl_kernels/common/crop_tensor.cl | 96 +
 .../CL/cl_kernels/common/deconvolution_layer.cl | 130 +
 .../CL/cl_kernels/common/dequantization_layer.cl | 90 +
 .../CL/cl_kernels/common/elementwise_operation.cl | 150 +
 .../common/elementwise_operation_quantized.cl | 138 +
 src/core/CL/cl_kernels/common/elementwise_unary.cl | 93 +
 src/core/CL/cl_kernels/common/fft.cl | 1880 +++++++++
 src/core/CL/cl_kernels/common/fft_digit_reverse.cl | 154 +
 src/core/CL/cl_kernels/common/fft_scale.cl | 81 +
 src/core/CL/cl_kernels/common/fill_border.cl | 165 +
 src/core/CL/cl_kernels/common/floor.cl | 68 +
 src/core/CL/cl_kernels/common/gather.cl | 91 +
 src/core/CL/cl_kernels/common/gemm.cl | 4386 ++++++++++++++++++++
 src/core/CL/cl_kernels/common/gemm_v1.cl | 3243 +++++++++++++++
 src/core/CL/cl_kernels/common/gemmlowp.cl | 2160 ++++++++++
 src/core/CL/cl_kernels/common/gemv.cl | 200 +
 .../CL/cl_kernels/common/generate_proposals.cl | 88 +
 .../common/generate_proposals_quantized.cl | 87 +
 .../CL/cl_kernels/common/instance_normalization.cl | 254 ++
 src/core/CL/cl_kernels/common/l2_normalize.cl | 189 +
 .../cl_kernels/common/mean_stddev_normalization.cl | 118 +
 src/core/CL/cl_kernels/common/memset.cl | 67 +
 src/core/CL/cl_kernels/common/minmax_layer.cl | 101 +
 src/core/CL/cl_kernels/common/nonmax.cl | 70 +
 src/core/CL/cl_kernels/common/pad_layer.cl | 263 ++
 src/core/CL/cl_kernels/common/permute.cl | 74 +
 .../CL/cl_kernels/common/pixelwise_mul_float.cl | 179 +
 src/core/CL/cl_kernels/common/pixelwise_mul_int.cl | 203 +
 src/core/CL/cl_kernels/common/pooling_layer.cl | 390 ++
 .../cl_kernels/common/qlstm_layer_normalization.cl | 260 ++
 .../CL/cl_kernels/common/quantization_layer.cl | 108 +
 src/core/CL/cl_kernels/common/range.cl | 128 +
 .../CL/cl_kernels/common/reduction_operation.cl | 460 ++
 src/core/CL/cl_kernels/common/reshape_layer.cl | 70 +
 src/core/CL/cl_kernels/common/reverse.cl | 102 +
 src/core/CL/cl_kernels/common/roi_align_layer.cl | 200 +
 .../cl_kernels/common/roi_align_layer_quantized.cl | 206 +
 src/core/CL/cl_kernels/common/roi_pooling_layer.cl | 196 +
 src/core/CL/cl_kernels/common/select.cl | 228 +
 src/core/CL/cl_kernels/common/slice_ops.cl | 135 +
 src/core/CL/cl_kernels/common/softmax_layer.cl | 531 +++
 .../cl_kernels/common/softmax_layer_quantized.cl | 530 +++
 src/core/CL/cl_kernels/common/stack_layer.cl | 113 +
 src/core/CL/cl_kernels/common/tile.cl | 97 +
 src/core/CL/cl_kernels/common/transpose.cl | 240 ++
 src/core/CL/cl_kernels/common/unpooling_layer.cl | 72 +
 src/core/CL/cl_kernels/comparisons.cl | 150 -
 src/core/CL/cl_kernels/concatenate.cl | 415 --
 src/core/CL/cl_kernels/convert_fc_weights.cl | 58 -
 src/core/CL/cl_kernels/convolution_layer.cl | 112 -
 src/core/CL/cl_kernels/copy_tensor.cl | 72 -
 src/core/CL/cl_kernels/crop_tensor.cl | 96 -
 src/core/CL/cl_kernels/deconvolution_layer.cl | 130 -
 src/core/CL/cl_kernels/depth_to_space.cl | 111 -
 src/core/CL/cl_kernels/dequantization_layer.cl | 212 -
 src/core/CL/cl_kernels/direct_convolution.cl | 275 --
 src/core/CL/cl_kernels/direct_convolution1x1.cl | 316 --
 src/core/CL/cl_kernels/direct_convolution3x3.cl | 291 --
 src/core/CL/cl_kernels/direct_convolution5x5.cl | 313 --
 .../CL/cl_kernels/direct_convolution_quantized.cl | 308 --
 src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl | 219 -
 .../CL/cl_kernels/dwc_native_quantized_nhwc.cl | 284 --
 src/core/CL/cl_kernels/elementwise_operation.cl | 150 -
 .../cl_kernels/elementwise_operation_quantized.cl | 138 -
 src/core/CL/cl_kernels/elementwise_unary.cl | 93 -
 src/core/CL/cl_kernels/fft.cl | 1880 ---------
 src/core/CL/cl_kernels/fft_digit_reverse.cl | 154 -
 src/core/CL/cl_kernels/fft_scale.cl | 81 -
 src/core/CL/cl_kernels/fill_border.cl | 165 -
 src/core/CL/cl_kernels/floor.cl | 68 -
 src/core/CL/cl_kernels/gather.cl | 91 -
 src/core/CL/cl_kernels/gemm.cl | 4386 --------
 src/core/CL/cl_kernels/gemm_v1.cl | 3243 ---------------
 src/core/CL/cl_kernels/gemmlowp.cl | 2160 ----------
 src/core/CL/cl_kernels/gemv.cl | 200 -
 src/core/CL/cl_kernels/generate_proposals.cl | 88 -
 .../CL/cl_kernels/generate_proposals_quantized.cl | 87 -
 src/core/CL/cl_kernels/im2col.cl | 1360 ------
 src/core/CL/cl_kernels/instance_normalization.cl | 254 --
 src/core/CL/cl_kernels/l2_normalize.cl | 189 -
 .../CL/cl_kernels/mean_stddev_normalization.cl | 118 -
 src/core/CL/cl_kernels/memset.cl | 67 -
 src/core/CL/cl_kernels/minmax_layer.cl | 101 -
 src/core/CL/cl_kernels/nchw/batch_to_space.cl | 131 +
 .../CL/cl_kernels/nchw/batchnormalization_layer.cl | 147 +
 src/core/CL/cl_kernels/nchw/channel_shuffle.cl | 103 +
 src/core/CL/cl_kernels/nchw/depth_to_space.cl | 69 +
 .../CL/cl_kernels/nchw/dequantization_layer.cl | 86 +
 .../CL/cl_kernels/nchw/direct_convolution1x1.cl | 316 ++
 .../CL/cl_kernels/nchw/direct_convolution3x3.cl | 291 ++
 .../CL/cl_kernels/nchw/direct_convolution5x5.cl | 313 ++
 .../nchw/direct_convolution_quantized.cl | 308 ++
 src/core/CL/cl_kernels/nchw/im2col.cl | 863 ++++
 src/core/CL/cl_kernels/nchw/normalization_layer.cl | 175 +
 .../cl_kernels/nchw/normalize_planar_yuv_layer.cl | 82 +
 .../nchw/normalize_planar_yuv_layer_quantized.cl | 101 +
 src/core/CL/cl_kernels/nchw/pooling_layer.cl | 331 ++
 .../CL/cl_kernels/nchw/pooling_layer_quantized.cl | 142 +
 src/core/CL/cl_kernels/nchw/prior_box_layer.cl | 139 +
 src/core/CL/cl_kernels/nchw/remap.cl | 133 +
 src/core/CL/cl_kernels/nchw/reorg_layer.cl | 75 +
 src/core/CL/cl_kernels/nchw/scale.cl | 148 +
 src/core/CL/cl_kernels/nchw/scale_quantized.cl | 86 +
 src/core/CL/cl_kernels/nchw/space_to_batch.cl | 156 +
 src/core/CL/cl_kernels/nchw/space_to_depth.cl | 69 +
 src/core/CL/cl_kernels/nchw/upsample_layer.cl | 79 +
 .../cl_kernels/nchw/winograd_filter_transform.cl | 911 ++++
 .../CL/cl_kernels/nchw/winograd_input_transform.cl | 1346 ++++++
 .../cl_kernels/nchw/winograd_output_transform.cl | 1082 +++++
 src/core/CL/cl_kernels/nhwc/batch_to_space.cl | 131 +
 .../CL/cl_kernels/nhwc/batchnormalization_layer.cl | 146 +
 src/core/CL/cl_kernels/nhwc/channel_shuffle.cl | 160 +
 src/core/CL/cl_kernels/nhwc/depth_to_space.cl | 69 +
 .../CL/cl_kernels/nhwc/dequantization_layer.cl | 87 +
 src/core/CL/cl_kernels/nhwc/direct_convolution.cl | 275 ++
 src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl | 219 +
 .../cl_kernels/nhwc/dwc_native_quantized_nhwc.cl | 284 ++
 src/core/CL/cl_kernels/nhwc/im2col.cl | 532 +++
 src/core/CL/cl_kernels/nhwc/normalization_layer.cl | 177 +
 .../cl_kernels/nhwc/normalize_planar_yuv_layer.cl | 81 +
 .../nhwc/normalize_planar_yuv_layer_quantized.cl | 96 +
 src/core/CL/cl_kernels/nhwc/pooling_layer.cl | 364 ++
 .../CL/cl_kernels/nhwc/pooling_layer_quantized.cl | 164 +
 src/core/CL/cl_kernels/nhwc/remap.cl | 180 +
 src/core/CL/cl_kernels/nhwc/reorg_layer.cl | 76 +
 src/core/CL/cl_kernels/nhwc/scale.cl | 174 +
 src/core/CL/cl_kernels/nhwc/scale_quantized.cl | 124 +
 src/core/CL/cl_kernels/nhwc/space_to_batch.cl | 155 +
 src/core/CL/cl_kernels/nhwc/space_to_depth.cl | 69 +
 src/core/CL/cl_kernels/nhwc/upsample_layer.cl | 80 +
 .../cl_kernels/nhwc/winograd_filter_transform.cl | 1075 +++++
 .../CL/cl_kernels/nhwc/winograd_input_transform.cl | 953 +++++
 .../cl_kernels/nhwc/winograd_output_transform.cl | 1030 +++++
 src/core/CL/cl_kernels/nonmax.cl | 70 -
 src/core/CL/cl_kernels/normalization_layer.cl | 319 --
 .../CL/cl_kernels/normalize_planar_yuv_layer.cl | 134 -
 .../normalize_planar_yuv_layer_quantized.cl | 166 -
 src/core/CL/cl_kernels/pad_layer.cl | 263 --
 src/core/CL/cl_kernels/permute.cl | 74 -
 src/core/CL/cl_kernels/pixelwise_mul_float.cl | 179 -
 src/core/CL/cl_kernels/pixelwise_mul_int.cl | 203 -
 src/core/CL/cl_kernels/pooling_layer.cl | 981 -----
 src/core/CL/cl_kernels/pooling_layer_quantized.cl | 266 --
 src/core/CL/cl_kernels/prior_box_layer.cl | 139 -
 .../CL/cl_kernels/qlstm_layer_normalization.cl | 260 --
 src/core/CL/cl_kernels/quantization_layer.cl | 108 -
 src/core/CL/cl_kernels/range.cl | 128 -
 src/core/CL/cl_kernels/reduction_operation.cl | 460 --
 src/core/CL/cl_kernels/remap.cl | 286 --
 src/core/CL/cl_kernels/reorg_layer.cl | 116 -
 src/core/CL/cl_kernels/reshape_layer.cl | 70 -
 src/core/CL/cl_kernels/reverse.cl | 102 -
 src/core/CL/cl_kernels/roi_align_layer.cl | 200 -
 .../CL/cl_kernels/roi_align_layer_quantized.cl | 206 -
 src/core/CL/cl_kernels/roi_pooling_layer.cl | 196 -
 src/core/CL/cl_kernels/scale.cl | 297 --
 src/core/CL/cl_kernels/scale_quantized.cl | 185 -
 src/core/CL/cl_kernels/select.cl | 228 -
 src/core/CL/cl_kernels/slice_ops.cl | 135 -
 src/core/CL/cl_kernels/sobel_filter.cl | 541 ---
 src/core/CL/cl_kernels/softmax_layer.cl | 531 ---
 src/core/CL/cl_kernels/softmax_layer_quantized.cl | 530 ---
 src/core/CL/cl_kernels/space_to_batch.cl | 280 --
 src/core/CL/cl_kernels/space_to_depth.cl | 111 -
 src/core/CL/cl_kernels/stack_layer.cl | 113 -
 src/core/CL/cl_kernels/tile.cl | 97 -
 src/core/CL/cl_kernels/transpose.cl | 240 --
 src/core/CL/cl_kernels/unpooling_layer.cl | 72 -
 src/core/CL/cl_kernels/upsample_layer.cl | 135 -
 .../CL/cl_kernels/winograd_filter_transform.cl | 1952 ---------
 src/core/CL/cl_kernels/winograd_input_transform.cl | 2233 ----------
 .../CL/cl_kernels/winograd_output_transform.cl | 2063 ---------
 197 files changed, 35651 insertions(+), 35370 deletions(-)
 delete mode 100644 src/core/CL/cl_kernels/activation_layer.cl
 delete mode 100644 src/core/CL/cl_kernels/activation_layer_quant.cl
 delete mode 100644 src/core/CL/cl_kernels/arg_min_max.cl
 delete mode 100644 src/core/CL/cl_kernels/batch_to_space.cl
 delete mode 100644 src/core/CL/cl_kernels/batchnormalization_layer.cl
 delete mode 100644 src/core/CL/cl_kernels/bitwise_op.cl
 delete mode 100644 src/core/CL/cl_kernels/bounding_box_transform.cl
 delete mode 100644 src/core/CL/cl_kernels/bounding_box_transform_quantized.cl
 delete mode 100644 src/core/CL/cl_kernels/cast.cl
 delete mode 100644 src/core/CL/cl_kernels/channel_shuffle.cl
 delete mode 100644 src/core/CL/cl_kernels/col2im.cl
 create mode 100644 src/core/CL/cl_kernels/common/activation_layer.cl
 create mode 100644 src/core/CL/cl_kernels/common/activation_layer_quant.cl
 create mode 100644 src/core/CL/cl_kernels/common/arg_min_max.cl
 create mode 100644 src/core/CL/cl_kernels/common/batchnormalization_layer.cl
 create mode 100644 src/core/CL/cl_kernels/common/bitwise_op.cl
 create mode 100644 src/core/CL/cl_kernels/common/bounding_box_transform.cl
 create mode 100644 src/core/CL/cl_kernels/common/bounding_box_transform_quantized.cl
 create mode 100644 src/core/CL/cl_kernels/common/cast.cl
 create mode 100644 src/core/CL/cl_kernels/common/col2im.cl
 create mode 100644 src/core/CL/cl_kernels/common/comparisons.cl
 create mode 100644 src/core/CL/cl_kernels/common/concatenate.cl
 create mode 100644 src/core/CL/cl_kernels/common/convert_fc_weights.cl
 create mode 100644 src/core/CL/cl_kernels/common/convolution_layer.cl
 create mode 100644 src/core/CL/cl_kernels/common/copy_tensor.cl
 create mode 100644 src/core/CL/cl_kernels/common/crop_tensor.cl
 create mode 100644 src/core/CL/cl_kernels/common/deconvolution_layer.cl
 create mode 100644 src/core/CL/cl_kernels/common/dequantization_layer.cl
 create mode 100644 src/core/CL/cl_kernels/common/elementwise_operation.cl
 create mode 100644 src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl
 create mode 100644 src/core/CL/cl_kernels/common/elementwise_unary.cl
 create mode 100644 src/core/CL/cl_kernels/common/fft.cl
 create mode 100644 src/core/CL/cl_kernels/common/fft_digit_reverse.cl
 create mode 100644 src/core/CL/cl_kernels/common/fft_scale.cl
 create mode 100644 src/core/CL/cl_kernels/common/fill_border.cl
 create mode 100644 src/core/CL/cl_kernels/common/floor.cl
 create mode 100644 src/core/CL/cl_kernels/common/gather.cl
 create mode 100644 src/core/CL/cl_kernels/common/gemm.cl
 create mode 100644 src/core/CL/cl_kernels/common/gemm_v1.cl
 create mode 100644 src/core/CL/cl_kernels/common/gemmlowp.cl
 create mode 100644 src/core/CL/cl_kernels/common/gemv.cl
 create mode 100644 src/core/CL/cl_kernels/common/generate_proposals.cl
 create mode 100644 src/core/CL/cl_kernels/common/generate_proposals_quantized.cl
 create mode 100644 src/core/CL/cl_kernels/common/instance_normalization.cl
 create mode 100644 src/core/CL/cl_kernels/common/l2_normalize.cl
 create mode 100644 src/core/CL/cl_kernels/common/mean_stddev_normalization.cl
 create mode 100644 src/core/CL/cl_kernels/common/memset.cl
 create mode 100644 src/core/CL/cl_kernels/common/minmax_layer.cl
 create mode 100644 src/core/CL/cl_kernels/common/nonmax.cl
 create mode 100644 src/core/CL/cl_kernels/common/pad_layer.cl
 create mode 100644 src/core/CL/cl_kernels/common/permute.cl
 create mode 100644 src/core/CL/cl_kernels/common/pixelwise_mul_float.cl
 create mode 100644 src/core/CL/cl_kernels/common/pixelwise_mul_int.cl
 create mode 100644 src/core/CL/cl_kernels/common/pooling_layer.cl
 create mode 100644 src/core/CL/cl_kernels/common/qlstm_layer_normalization.cl
 create mode 100644 src/core/CL/cl_kernels/common/quantization_layer.cl
 create mode 100644 src/core/CL/cl_kernels/common/range.cl
 create mode 100644 src/core/CL/cl_kernels/common/reduction_operation.cl
 create mode 100644 src/core/CL/cl_kernels/common/reshape_layer.cl
 create mode 100644 src/core/CL/cl_kernels/common/reverse.cl
 create mode 100644 src/core/CL/cl_kernels/common/roi_align_layer.cl
 create mode 100644 src/core/CL/cl_kernels/common/roi_align_layer_quantized.cl
 create mode 100644 src/core/CL/cl_kernels/common/roi_pooling_layer.cl
 create mode 100644 src/core/CL/cl_kernels/common/select.cl
 create mode 100644 src/core/CL/cl_kernels/common/slice_ops.cl
 create mode 100644 src/core/CL/cl_kernels/common/softmax_layer.cl
 create mode 100644 src/core/CL/cl_kernels/common/softmax_layer_quantized.cl
 create mode 100644 src/core/CL/cl_kernels/common/stack_layer.cl
 create mode 100644 src/core/CL/cl_kernels/common/tile.cl
 create mode 100644 src/core/CL/cl_kernels/common/transpose.cl
 create mode 100644 src/core/CL/cl_kernels/common/unpooling_layer.cl
 delete mode 100644 src/core/CL/cl_kernels/comparisons.cl
 delete mode 100644 src/core/CL/cl_kernels/concatenate.cl
 delete mode 100644 src/core/CL/cl_kernels/convert_fc_weights.cl
 delete mode 100644 src/core/CL/cl_kernels/convolution_layer.cl
 delete mode 100644 src/core/CL/cl_kernels/copy_tensor.cl
 delete mode 100644 src/core/CL/cl_kernels/crop_tensor.cl
 delete mode 100644 src/core/CL/cl_kernels/deconvolution_layer.cl
 delete mode 100644 src/core/CL/cl_kernels/depth_to_space.cl
 delete mode 100644 src/core/CL/cl_kernels/dequantization_layer.cl
 delete mode 100644 src/core/CL/cl_kernels/direct_convolution.cl
 delete mode 100644 src/core/CL/cl_kernels/direct_convolution1x1.cl
 delete mode 100644 src/core/CL/cl_kernels/direct_convolution3x3.cl
 delete mode 100644 src/core/CL/cl_kernels/direct_convolution5x5.cl
 delete mode 100644 src/core/CL/cl_kernels/direct_convolution_quantized.cl
 delete mode 100644 src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl
 delete mode 100644 src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl
 delete mode 100644 src/core/CL/cl_kernels/elementwise_operation.cl
 delete mode 100644 src/core/CL/cl_kernels/elementwise_operation_quantized.cl
 delete mode 100644 src/core/CL/cl_kernels/elementwise_unary.cl
 delete mode 100644 src/core/CL/cl_kernels/fft.cl
 delete mode 100644 src/core/CL/cl_kernels/fft_digit_reverse.cl
 delete mode 100644 src/core/CL/cl_kernels/fft_scale.cl
 delete mode 100644 src/core/CL/cl_kernels/fill_border.cl
 delete mode 100644 src/core/CL/cl_kernels/floor.cl
 delete mode 100644 src/core/CL/cl_kernels/gather.cl
 delete mode 100644 src/core/CL/cl_kernels/gemm.cl
 delete mode 100644 src/core/CL/cl_kernels/gemm_v1.cl
 delete mode 100644 src/core/CL/cl_kernels/gemmlowp.cl
 delete mode 100644 src/core/CL/cl_kernels/gemv.cl
 delete mode 100644 src/core/CL/cl_kernels/generate_proposals.cl
 delete mode 100644 src/core/CL/cl_kernels/generate_proposals_quantized.cl
 delete mode 100644 src/core/CL/cl_kernels/im2col.cl
 delete mode 100644 src/core/CL/cl_kernels/instance_normalization.cl
 delete mode 100644 src/core/CL/cl_kernels/l2_normalize.cl
 delete mode 100644 src/core/CL/cl_kernels/mean_stddev_normalization.cl
 delete mode 100644 src/core/CL/cl_kernels/memset.cl
 delete mode 100644 src/core/CL/cl_kernels/minmax_layer.cl
 create mode 100644 src/core/CL/cl_kernels/nchw/batch_to_space.cl
 create mode 100644 src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl
 create mode 100644 src/core/CL/cl_kernels/nchw/channel_shuffle.cl
 create mode 100644 src/core/CL/cl_kernels/nchw/depth_to_space.cl
 create mode 100644 src/core/CL/cl_kernels/nchw/dequantization_layer.cl
 create mode 100644 src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl
 create mode 100644 src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl
 create mode 100644 src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl
 create mode 100644 src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl
 create mode 100644 src/core/CL/cl_kernels/nchw/im2col.cl
 create mode 100644 src/core/CL/cl_kernels/nchw/normalization_layer.cl
 create mode 100644 src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl
 create mode 100644 src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl
 create mode 100644 src/core/CL/cl_kernels/nchw/pooling_layer.cl
 create mode 100644 src/core/CL/cl_kernels/nchw/pooling_layer_quantized.cl
 create mode 100644 src/core/CL/cl_kernels/nchw/prior_box_layer.cl
 create mode 100644 src/core/CL/cl_kernels/nchw/remap.cl
 create mode 100644 src/core/CL/cl_kernels/nchw/reorg_layer.cl
 create mode 100644 src/core/CL/cl_kernels/nchw/scale.cl
 create mode 100644 src/core/CL/cl_kernels/nchw/scale_quantized.cl
 create mode 100644 src/core/CL/cl_kernels/nchw/space_to_batch.cl
 create mode 100644 src/core/CL/cl_kernels/nchw/space_to_depth.cl
 create mode 100644 src/core/CL/cl_kernels/nchw/upsample_layer.cl
 create mode 100644 src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl
 create mode 100644 src/core/CL/cl_kernels/nchw/winograd_input_transform.cl
 create mode 100644 src/core/CL/cl_kernels/nchw/winograd_output_transform.cl
 create mode 100644 src/core/CL/cl_kernels/nhwc/batch_to_space.cl
 create mode 100644 src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl
 create mode 100644 src/core/CL/cl_kernels/nhwc/channel_shuffle.cl
 create mode 100644 src/core/CL/cl_kernels/nhwc/depth_to_space.cl
 create mode 100644 src/core/CL/cl_kernels/nhwc/dequantization_layer.cl
 create mode 100644 src/core/CL/cl_kernels/nhwc/direct_convolution.cl
 create mode 100644 src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
 create mode 100644 src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
 create mode 100644 src/core/CL/cl_kernels/nhwc/im2col.cl
 create mode 100644 src/core/CL/cl_kernels/nhwc/normalization_layer.cl
 create mode 100644 src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl
 create mode 100644 src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl
 create mode 100644 src/core/CL/cl_kernels/nhwc/pooling_layer.cl
 create mode 100644 src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl
 create mode 100644 src/core/CL/cl_kernels/nhwc/remap.cl
 create mode 100644 src/core/CL/cl_kernels/nhwc/reorg_layer.cl
 create mode 100644 src/core/CL/cl_kernels/nhwc/scale.cl
 create mode 100644 src/core/CL/cl_kernels/nhwc/scale_quantized.cl
 create mode 100644 src/core/CL/cl_kernels/nhwc/space_to_batch.cl
 create mode 100644 src/core/CL/cl_kernels/nhwc/space_to_depth.cl
 create mode 100644 src/core/CL/cl_kernels/nhwc/upsample_layer.cl
 create mode 100644 src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl
 create mode 100644 src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl
 create mode 100644 src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl
 delete mode 100644 src/core/CL/cl_kernels/nonmax.cl
 delete mode 100644 src/core/CL/cl_kernels/normalization_layer.cl
 delete mode 100644 src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl
 delete mode 100644 src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl
 delete mode 100644 src/core/CL/cl_kernels/pad_layer.cl
 delete mode 100644 src/core/CL/cl_kernels/permute.cl
 delete mode 100644 src/core/CL/cl_kernels/pixelwise_mul_float.cl
 delete mode 100644 src/core/CL/cl_kernels/pixelwise_mul_int.cl
 delete mode 100644 src/core/CL/cl_kernels/pooling_layer.cl
 delete mode 100644 src/core/CL/cl_kernels/pooling_layer_quantized.cl
 delete mode 100644 src/core/CL/cl_kernels/prior_box_layer.cl
 delete mode 100644 src/core/CL/cl_kernels/qlstm_layer_normalization.cl
 delete mode 100644 src/core/CL/cl_kernels/quantization_layer.cl
 delete mode 100644 src/core/CL/cl_kernels/range.cl
 delete mode 100644 src/core/CL/cl_kernels/reduction_operation.cl
 delete mode 100644 src/core/CL/cl_kernels/remap.cl
 delete mode 100644 src/core/CL/cl_kernels/reorg_layer.cl
 delete mode 100644 src/core/CL/cl_kernels/reshape_layer.cl
 delete mode 100644 src/core/CL/cl_kernels/reverse.cl
 delete mode 100644 src/core/CL/cl_kernels/roi_align_layer.cl
 delete mode 100644 src/core/CL/cl_kernels/roi_align_layer_quantized.cl
 delete mode 100644 src/core/CL/cl_kernels/roi_pooling_layer.cl
 delete mode 100644 src/core/CL/cl_kernels/scale.cl
 delete mode 100644 src/core/CL/cl_kernels/scale_quantized.cl
 delete mode 100644 src/core/CL/cl_kernels/select.cl
 delete mode 100644 src/core/CL/cl_kernels/slice_ops.cl
 delete mode 100644 src/core/CL/cl_kernels/sobel_filter.cl
 delete mode 100644 src/core/CL/cl_kernels/softmax_layer.cl
 delete mode 100644 src/core/CL/cl_kernels/softmax_layer_quantized.cl
 delete mode 100644 src/core/CL/cl_kernels/space_to_batch.cl
 delete mode 100644 src/core/CL/cl_kernels/space_to_depth.cl
 delete mode 100644 src/core/CL/cl_kernels/stack_layer.cl
 delete mode 100644 src/core/CL/cl_kernels/tile.cl
 delete mode 100644 src/core/CL/cl_kernels/transpose.cl
 delete mode 100644 src/core/CL/cl_kernels/unpooling_layer.cl
 delete mode 100644 src/core/CL/cl_kernels/upsample_layer.cl
 delete mode 100644 src/core/CL/cl_kernels/winograd_filter_transform.cl
 delete mode 100644 src/core/CL/cl_kernels/winograd_input_transform.cl
 delete mode 100644 src/core/CL/cl_kernels/winograd_output_transform.cl
(limited to 'src/core/CL/cl_kernels')

diff --git a/src/core/CL/cl_kernels/activation_layer.cl b/src/core/CL/cl_kernels/activation_layer.cl
deleted file mode 100644
index bc2c99b6c8..0000000000
---
a/src/core/CL/cl_kernels/activation_layer.cl +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#if defined(ACT) && defined(DATA_TYPE) && defined(VEC_SIZE) - -#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - -#include "activation_float_helpers.h" - -/** This performs an activation function floating point inputs. - * - * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE - * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH - * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively. - * - * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void activation_layer( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ -) -{ - uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0); - - // Get pixels pointer - __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z; -#ifdef IN_PLACE - __global uchar *output_addr = input_addr; -#else /* IN_PLACE */ - __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z; -#endif /* IN_PLACE */ - - // Load data - TYPE data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr); - - // Perform activation - data0 = ACTIVATION(ACT, DATA_TYPE, VEC_SIZE, data0, A_VAL, B_VAL); - - // Store result - STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) -} - -#endif /* defined(ACT) */ diff --git a/src/core/CL/cl_kernels/activation_layer_quant.cl b/src/core/CL/cl_kernels/activation_layer_quant.cl deleted file mode 100644 index 66261019ab..0000000000 --- a/src/core/CL/cl_kernels/activation_layer_quant.cl +++ /dev/null @@ -1,162 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
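/*
 * Illustrative sketch (editorial addition, not from the library): the x_offs
 * expression in activation_layer above is the "shifted leftover" pattern used
 * throughout these kernels. When the width is not a multiple of VEC_SIZE,
 * work-item 0 stores only VEC_SIZE_LEFTOVER lanes, and every later work-item
 * rewinds its offset by (VEC_SIZE - VEC_SIZE_LEFTOVER) elements so that it can
 * still issue a full, in-bounds vector access. Worked example, assuming
 * width = 19, VEC_SIZE = 16 and VEC_SIZE_LEFTOVER = 3:
 *   gid 0: max(0 * 16 - 13, 0) = 0 -> loads elements 0..15, stores lanes 0..2
 *   gid 1: max(1 * 16 - 13, 0) = 3 -> loads elements 3..18, stores all 16 lanes
 * A minimal standalone kernel with the same offset math, in elements rather
 * than bytes (kernel name and flat layout are hypothetical):
 */
__kernel void leftover_copy_sketch(__global const float *src, __global float *dst)
{
    const int VS  = 16;                                /* VEC_SIZE                   */
    const int LO  = 3;                                 /* VEC_SIZE_LEFTOVER          */
    const int gid = get_global_id(0);
    const int x   = max(gid * VS - (VS - LO) % VS, 0); /* % VS makes LO == 0 a no-op */

    float16 v = vload16(0, src + x);                   /* always a full in-bounds load */
    if(LO != 0 && gid == 0)
    {
        vstore3(v.s012, 0, dst + x);                   /* partial store of the leftover lanes */
    }
    else
    {
        vstore16(v, 0, dst + x);                       /* full vector store */
    }
}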
- */ -#include "activation_quant_helpers.h" - -#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) - -#if defined(FLOAT_DOMAIN) -// Activations performed in the float domain - -#include "activation_float_helpers.h" - -/** This performs an activation function on quantized inputs with float transformations. - * - * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE - * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively. - * @note Quantization scales of the input/output tensors are passed in with -DS1_VAL= and -DS2_VAL= respectively. - * @note Quantization offsets of the input/output tensors are passed in only if asymmetric with -DO1_VAL= and -DO2_VAL= respectively. - * @note Quantized value of constant zero should be given as a preprocessor argument using -DCONST_0=value. e.g. -DCONST_0=128. - * - * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM16 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr (Optional) Pointer to the destination image. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image - */ -__kernel void activation_layer_quant_f32( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ -) -{ - uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0); - - // Get pixels pointer - __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z; -#ifdef IN_PLACE - __global uchar *output_addr = input_addr; -#else /* IN_PLACE */ - __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z; -#endif /* IN_PLACE */ - - // Load data - TYPE data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr); - - VEC_FLOAT data_flt = CONVERT(data0, VEC_FLOAT); -#if defined(O1_VAL) - data_flt = round(data_flt - (float)O1_VAL) * ((float)S1_VAL); -#else // defined(O1_VAL) - data_flt = round(data_flt) * ((float)S1_VAL); -#endif // defined(O1_VAL) - data_flt = ACTIVATION(ACT, float, VEC_SIZE, data_flt, A_VAL, B_VAL); - -#if defined(O2_VAL) - data0 = CONVERT_SAT(round(data_flt / ((float)S2_VAL)) + (float)O2_VAL, TYPE); -#else // defined(O2_VAL) - data0 = CONVERT_SAT(round(data_flt / ((float)S2_VAL)), TYPE); -#endif // defined(O2_VAL) - - // Store result - STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) -} - -#else // defined(FLOAT_DOMAIN) -// Activations performed in the quantized domain - -#if defined(ACT) -/** This performs an activation function on quantized inputs. - * - * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE - * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH - * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively. - * @note Quantization scales of the input/output tensors are passed in with -DS1_VAL= and -DS2_VAL= respectively. - * @note Quantization offsets of the input/output tensors are passed in with -DO1_VAL= and -DO2_VAL= respectively. 
- * @note Quantized value of constant zero should be given as a preprocessor argument using -DCONST_0=value. e.g. -DCONST_0=128. - * - * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM16 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image - */ -__kernel void activation_layer_quant( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ -) -{ - uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0); - - // Get pixels pointer - __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z; -#ifdef IN_PLACE - __global uchar *output_addr = input_addr; -#else /* IN_PLACE */ - __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z; -#endif /* IN_PLACE */ - - // Load data - TYPE data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr); - - data0 = PERFORM_ACTIVATION_QUANT(ACT, data0); - - // Store result - STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) -} -#endif // defined(ACT) -#endif // defined(FLOAT_DOMAIN) diff --git a/src/core/CL/cl_kernels/arg_min_max.cl b/src/core/CL/cl_kernels/arg_min_max.cl deleted file mode 100644 index 6e57ed0af1..0000000000 --- a/src/core/CL/cl_kernels/arg_min_max.cl +++ /dev/null @@ -1,451 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. 
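/*
 * Illustrative sketch (editorial addition, not from the library):
 * activation_layer_quant_f32 above works in the real domain: dequantize with
 * (x - O1_VAL) * S1_VAL, apply the float activation, then requantize with
 * round(y / S2_VAL) + O2_VAL and a saturating convert. activation_layer_quant
 * instead stays in the quantized domain, which works for activations that are
 * monotonic in the affine, order-preserving QASYMM8 encoding. Two scalar
 * sketches, assuming the hypothetical values S1_VAL = S2_VAL = 0.1f,
 * O1_VAL = O2_VAL = CONST_0 = 128 and ReLU-style activations:
 */
inline uchar quant_act_float_domain_sketch(uchar x)
{
    float v = (round((float)x) - 128.f) * 0.1f;        /* dequantize: 150 -> 2.2f */
    v       = clamp(v, 0.f, 6.f);                      /* bounded ReLU in float   */
    return convert_uchar_sat(round(v / 0.1f) + 128.f); /* requantize: 2.2f -> 150 */
}

inline uchar quant_act_quantized_domain_sketch(uchar x)
{
    return max(x, (uchar)128); /* plain ReLU: clamp against the quantized zero point */
}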
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE_OUTPUT) - -#define VEC_TYPE_IN VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) -#define VEC_TYPE_OUT VEC_DATA_TYPE(DATA_TYPE_OUTPUT, VEC_SIZE) -#define VEC_SELECT_IN SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) -#define VEC_SIGNED_INT_IN SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - -#if defined(FLOAT_DATA_TYPE) -#define ISGREATER(x, y) (VEC_SELECT_IN) isgreater(x, y) -#define ISLESS(x, y) (VEC_SELECT_IN) isless(x, y) -#else // !FLOAT_DATA_TYPE -#if defined(WIDTH) -#define ISGREATER(x, y) (x > y) ? 1 : 0 -#define ISLESS(x, y) (x < y) ? 1 : 0 -#else // !defined(WIDTH) -#define ISGREATER(x, y) select((VEC_SIGNED_INT_IN)0, (VEC_SIGNED_INT_IN)-1, (VEC_SIGNED_INT_IN)(x > y)) -#define ISLESS(x, y) select((VEC_SIGNED_INT_IN)0, (VEC_SIGNED_INT_IN)-1, (VEC_SIGNED_INT_IN)(x < y)) -#endif // defined(WIDTH) -#endif // defined(FLOAT_DATA_TYPE) - -#if defined(ARG_MAX) -#define CONDITION_TO_USE(x, y) ISGREATER(x, y) -#elif defined(ARG_MIN) -#define CONDITION_TO_USE(x, y) ISLESS(x, y) -#else // !(defined(ARG_MAX) || defined(ARG_MIN)) -#error "Unsupported reduction operation!" -#endif // defined(ARG_MAX) - -#if defined(WIDTH) -#if defined(ARG_MIN) -#if defined(PREV_OUTPUT) -/** Find index minimum value of a vector - * - * @param[in] input Pointer to the first value. - * - * @return index of the vector. - */ -inline DATA_TYPE_OUTPUT arg_idx_min_prev_out(__global const DATA_TYPE *input, __global const DATA_TYPE_OUTPUT *prev_res, const int x_idx) -{ - int end_elem = (x_idx + 1) * 16; - if(end_elem > WIDTH) - { - end_elem = WIDTH - x_idx * 16; - } - DATA_TYPE_OUTPUT res = prev_res[0]; - for(int x_v = 1; x_v < end_elem; ++x_v) - { - res = select(res, prev_res[x_v], *(input + prev_res[x_v]) < * (input + res)); - } - return res; -} -#else // !defined(PREV_OUTPUT) -/** Find index minimum value of a vector - * - * @param[in] input Pointer to the first value. - * - * @return index of the vector. 
- */ -inline DATA_TYPE_OUTPUT arg_idx_min(__global const DATA_TYPE *input, const int x_idx) -{ -#if WIDTH < 16 - DATA_TYPE_OUTPUT res = 0; - for(DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v) - { - res = select(res, x_v, *(input + x_v) < * (input + res)); - } - return res; -#else // WIDTH >= 16 - int x_elem = x_idx * 16; - const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH); - x_elem -= x_goback; - - VEC_DATA_TYPE(DATA_TYPE, 16) - in = vload16(0, input - x_goback); - VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) - res = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; - - SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, 8) - idx_sel = (in.s01234567 <= in.s89abcdef); - in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel); - res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8)); - - idx_sel.s0123 = (in.s0123 < in.s4567) || (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, 4))); - in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123); - res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4)); - - idx_sel.s01 = (in.s01 < in.s23) || (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, 2))); - in.s01 = select(in.s23, in.s01, idx_sel.s01); - res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2)); - - idx_sel.s0 = (in.s0 < in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), SIGNED_INT_DATA_TYPE(DATA_TYPE))); - res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int)); - - return res.s0 + x_elem; -#endif // WIDTH < 16 -} -#endif // defined(PREV_OUTPUT) -#endif // defined(ARG_MIN) -#if defined(ARG_MAX) -#if defined(PREV_OUTPUT) -/** Find index maximum value of a vector - * - * @param[in] input Pointer to the first value. - * - * @return index of the vector. - */ -inline DATA_TYPE_OUTPUT arg_idx_max_prev_out(__global const DATA_TYPE *input, __global const DATA_TYPE_OUTPUT *prev_res, const int x_idx) -{ - int end_elem = (x_idx + 1) * 16; - if(end_elem > WIDTH) - { - end_elem = WIDTH - x_idx * 16; - } - DATA_TYPE_OUTPUT res = prev_res[0]; - for(int x_v = 1; x_v < end_elem; ++x_v) - { - res = select(res, prev_res[x_v], *(input + prev_res[x_v]) > *(input + res)); - } - return res; -} -#else // !defined(PREV_OUTPUT) -/** Find index maximum value of a vector - * - * @param[in] input Pointer to the first value. - * - * @return index of the vector. 
- */ -inline DATA_TYPE_OUTPUT arg_idx_max(__global const DATA_TYPE *input, const int x_idx) -{ -#if WIDTH < 16 - DATA_TYPE_OUTPUT res = 0; - for(DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v) - { - res = select(res, x_v, *(input + x_v) > *(input + res)); - } - return res; -#else // WIDTH >= 16 - int x_elem = x_idx * 16; - const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH); - x_elem -= x_goback; - - VEC_DATA_TYPE(DATA_TYPE, 16) - in = vload16(0, input - x_goback); - VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) - res = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; - - SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, 8) - idx_sel = (in.s01234567 >= in.s89abcdef); - in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel); - res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8)); - - idx_sel.s0123 = (in.s0123 > in.s4567) || (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, 4))); - in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123); - res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4)); - - idx_sel.s01 = (in.s01 > in.s23) || (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, 2))); - in.s01 = select(in.s23, in.s01, idx_sel.s01); - res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2)); - - idx_sel.s0 = (in.s0 > in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), SIGNED_INT_DATA_TYPE(DATA_TYPE))); - res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int)); - - return res.s0 + x_elem; -#endif // WIDTH < 16 -} -#endif // defined(PREV_OUTPUT) -#endif // defined(ARG_MAX) - -/** This kernel performs parallel reduction given an operation on x-axis. - * - * @note In case the results of previous stages are passed the flag PREV_OUTPUT has to be passed using -DPREV_OUTPUT - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g. -DDATA_TYPE_OUTPUT=uint - * @note The arg_max flag must be passed at compile time using -DARG_MAX if we want to compute the ArgMax - * @note The arg_min flag must be passed at compile time using -DARG_MIN if we want to compute the ArgMin - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] prev_res_ptr (Optional) Pointer to previous results tensor. 
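/*
 * Illustrative sketch (editorial addition, not from the library): arg_idx_min
 * and arg_idx_max above collapse a 16-wide vector to a single index in
 * log2(16) = 4 halving steps. Each step compares the two vector halves
 * lane-by-lane and keeps, per lane, the winning value together with its
 * original lane index, breaking value ties towards the smaller index so the
 * result matches a left-to-right scan. An 8-wide argmin sketch of the same
 * pattern (function name is hypothetical):
 */
inline int vec_argmin8_sketch(float8 in)
{
    int8 idx = (int8)(0, 1, 2, 3, 4, 5, 6, 7);

    /* 8 -> 4 lanes: keep the smaller value, or the smaller index on a tie */
    int4 sel4 = (in.s0123 < in.s4567) || ((in.s0123 == in.s4567) && (idx.s0123 < idx.s4567));
    in.s0123  = select(in.s4567, in.s0123, sel4);
    idx.s0123 = select(idx.s4567, idx.s0123, sel4);

    /* 4 -> 2 lanes */
    int2 sel2 = (in.s01 < in.s23) || ((in.s01 == in.s23) && (idx.s01 < idx.s23));
    in.s01  = select(in.s23, in.s01, sel2);
    idx.s01 = select(idx.s23, idx.s01, sel2);

    /* 2 -> 1 lane */
    const int keep0 = (in.s0 < in.s1) || ((in.s0 == in.s1) && (idx.s0 < idx.s1));
    return keep0 ? idx.s0 : idx.s1;
}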
Supported data types: U32/S32 - * @param[in] prev_res_stride_x (Optional) Stride of the output tensor in X dimension (in bytes) - * @param[in] prev_res_step_x (Optional) prev_res_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] prev_res_stride_y (Optional) Stride of the output tensor in Y dimension (in bytes) - * @param[in] prev_res_step_y (Optional) prev_res_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] prev_res_offset_first_element_in_bytes (Optional) The offset of the first element in the previous results tensor - * @param[in] partial_res_ptr The local buffer to hold partial result values. Supported data types: U32/S32 - * @param[in] partial_res_stride_x Stride of the output tensor in X dimension (in bytes) - * @param[in] partial_res_step_x partial_res_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] partial_res_stride_y Stride of the output tensor in Y dimension (in bytes) - * @param[in] partial_res_step_y partial_res_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] partial_res_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] local_results Local buffer for storing the partial result - */ -__kernel void arg_min_max_x( - IMAGE_DECLARATION(src), -#if defined(PREV_OUTPUT) - IMAGE_DECLARATION(prev_res), -#endif // defined(PREV_OUTPUT) - IMAGE_DECLARATION(partial_res), - __local DATA_TYPE_OUTPUT *local_results) -{ -#if defined(PREV_OUTPUT) - Image src = CONVERT_TO_IMAGE_STRUCT_NO_STEP(src); - Image prev_res = CONVERT_TO_IMAGE_STRUCT(prev_res); -#else // !defined(PREV_OUTPUT) - Image src = CONVERT_TO_IMAGE_STRUCT(src); -#endif // defined(PREV_OUTPUT) - Image partial_res = CONVERT_TO_IMAGE_STRUCT(partial_res); - - unsigned int lsize = get_local_size(0); - unsigned int lid = get_local_id(0); - - const uint x_idx = get_global_id(0); - const uint y_idx = get_global_id(1); - const __global DATA_TYPE *src_in_row = (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + y_idx * src_step_y); - - for(unsigned int y = 0; y < get_local_size(1); ++y) - { -#if defined(ARG_MAX) -#if defined(PREV_OUTPUT) - local_results[lid] = arg_idx_max_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx); -#else // !defined(PREV_OUTPUT) - local_results[lid] = arg_idx_max((__global DATA_TYPE *)offset(&src, 0, y), x_idx); -#endif // defined(PREV_OUTPUT) -#else // defined(ARG_MIN) -#if defined(PREV_OUTPUT) - local_results[lid] = arg_idx_min_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx); -#else // !defined(PREV_OUTPUT) - local_results[lid] = arg_idx_min((__global DATA_TYPE *)offset(&src, 0, y), x_idx); -#endif // defined(PREV_OUTPUT) -#endif // defined(ARG_MAX) || defined(ARG_MIN) - - barrier(CLK_LOCAL_MEM_FENCE); - - // Looking for the next highest power of 2 (maximum value of lsize is 8) - unsigned int middle = lsize - 1; - middle |= middle >> 1; - middle |= middle >> 2; - middle += 1; - // Perform parallel reduction - for(unsigned int i = middle; i > 0; i >>= 1) - { - if(lid < i && lid + i < lsize) - { - DATA_TYPE tmp0 = *(src_in_row + local_results[lid]); - DATA_TYPE tmp1 = *(src_in_row + local_results[lid + i]); -#if defined(ARG_MAX) - local_results[lid] = select( - local_results[lid], - local_results[lid + i], - ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 < tmp1)); -#else // defined(ARG_MIN) - 
local_results[lid] = select( - local_results[lid], - local_results[lid + i], - ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 > tmp1)); -#endif // defined(ARG_MAX) || defined(ARG_MIN) - } - barrier(CLK_LOCAL_MEM_FENCE); - } - - if(lid == 0) - { - ((__global DATA_TYPE_OUTPUT *)offset(&partial_res, get_group_id(0), y))[0] = local_results[0]; - } - } -} -#endif // defined(WIDTH) - -#if defined(HEIGHT) -/** This kernel performs reduction on y-axis. - * - * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE - * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g. -DDATA_TYPE_OUTPUT=uint - * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] output_ptr The local buffer to hold sumed values. Supported data types: U32/S32 - * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor - */ -__kernel void arg_min_max_y( - IMAGE_DECLARATION(input), - IMAGE_DECLARATION(output)) -{ - const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); - - __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y; - __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE_OUTPUT) + get_global_id(1) * output_stride_y; - - VEC_TYPE_IN res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_TYPE_IN); - - VEC_TYPE_OUT indx0 = 0; - for(DATA_TYPE_OUTPUT y = 1; y < HEIGHT; ++y) - { - VEC_TYPE_IN in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + y * input_stride_y)), VEC_TYPE_IN); - - VEC_TYPE_OUT cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_TYPE_OUT); - indx0 = select(indx0, (VEC_TYPE_OUT)y, cond_conv); - res = select(res, in, CONDITION_TO_USE(in, res)); - } - - // Store result - STORE_VECTOR_SELECT(indx, DATA_TYPE_OUTPUT, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); -} -#endif // defined(HEIGHT) - -#if defined(DEPTH) && !defined(BATCH) -/** This kernel performs reduction on z-axis. - * - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. 
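/*
 * Illustrative sketch (editorial addition, not from the library): the
 * parallel-reduction loop in arg_min_max_x above must also work when the
 * local size is not a power of two, so `middle` is first rounded up to the
 * next power of two by bit-smearing, and the `lid + i < lsize` guard skips
 * partners that do not exist. Worked example for lsize = 6: middle = 5 =
 * 0b101, then 0b101 | 0b010 = 0b111, then 0b111 | 0b001 = 0b111, plus 1
 * gives 8, so the loop strides are 8, 4, 2, 1 and every local result gets
 * paired. The rounding step in isolation (function name is hypothetical):
 */
inline uint next_pow2_leq8_sketch(uint lsize)
{
    uint m = lsize - 1u;
    m |= m >> 1;   /* smear the highest set bit downwards           */
    m |= m >> 2;   /* two shifts suffice because lsize is at most 8 */
    return m + 1u; /* 4 -> 4, 5 -> 8, 6 -> 8, 8 -> 8                */
}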
-DDATA_TYPE=float - * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE - * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] output_ptr The local buffer to hold sumed values. Supported data types: U32/S32 - * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor - */ -__kernel void arg_min_max_z( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); - - __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z; - __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE_OUTPUT) + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z; - - VEC_TYPE_IN res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_TYPE_IN); - - VEC_TYPE_OUT indx0 = 0; - for(DATA_TYPE_OUTPUT z = 1; z < DEPTH; ++z) - { - VEC_TYPE_IN in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + z * input_stride_z)), VEC_TYPE_IN); - - VEC_TYPE_OUT cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_TYPE_OUT); - indx0 = select(indx0, (VEC_TYPE_OUT)z, cond_conv); - res = select(res, in, CONDITION_TO_USE(in, res)); - } - - // Store result - STORE_VECTOR_SELECT(indx, DATA_TYPE_OUTPUT, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); -} -#endif /* defined(DEPTH) && !defined(BATCH) */ - -#if defined(BATCH) && defined(DEPTH) -/** This kernel performs reduction on w-axis. - * - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. 
- * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
- * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
- *
- * @param[in] input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
- * @param[in] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_stride_w                       Stride of the source tensor in W dimension (in bytes)
- * @param[in] input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
- * @param[in] output_ptr                           The local buffer to hold summed values. Supported data types: U32/S32
- * @param[in] output_stride_x                      Stride of the output tensor in X dimension (in bytes)
- * @param[in] output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y                      Stride of the output tensor in Y dimension (in bytes)
- * @param[in] output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z                      Stride of the output tensor in Z dimension (in bytes)
- * @param[in] output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w                      Stride of the output tensor in W dimension (in bytes)
- * @param[in] output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
- */
-__kernel void arg_min_max_w(
-    TENSOR4D_DECLARATION(input),
-    TENSOR4D_DECLARATION(output))
-{
-    const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
-
-    __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y + (get_global_id(2) % DEPTH) * input_stride_z +
-                                 (get_global_id(2) / DEPTH) * input_stride_w;
-    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE_OUTPUT) + get_global_id(1) * output_stride_y +
-                                  (get_global_id(2) % DEPTH) * output_stride_z + (get_global_id(2) / DEPTH) * output_stride_w;
-
-    VEC_TYPE_IN res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_TYPE_IN);
-
-    VEC_TYPE_OUT indx0 = 0;
-    for(DATA_TYPE_OUTPUT w = 1; w < BATCH; ++w)
-    {
-        VEC_TYPE_IN in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + w * input_stride_w)), VEC_TYPE_IN);
-
-        VEC_TYPE_OUT cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_TYPE_OUT);
-        indx0                  = select(indx0, (VEC_TYPE_OUT)w, cond_conv);
-        res                    = select(res, in, CONDITION_TO_USE(in, res));
-    }
-
-    // Store result
-    STORE_VECTOR_SELECT(indx, DATA_TYPE_OUTPUT, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
-}
-#endif /* defined(BATCH) && defined(DEPTH) */
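The y-, z- and w-axis kernels above share one branchless pattern: each work item keeps a vector res of running best values and a parallel vector indx0 of their coordinates, and updates both with select() as the loop walks the reduction axis; CONDITION_TO_USE supplies the comparison for ARG_MIN/ARG_MAX. The x_offs expression handles tails: every work item except the first is shifted left by (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE, so with a 10-element row, VEC_SIZE=4 and VEC_SIZE_LEFTOVER=2 the three items start at offsets 0, 2 and 6, and STORE_VECTOR_SELECT writes only the two leftover lanes for work item 0. A minimal scalar C sketch of the select-based update, assuming ARG_MIN semantics (illustrative code, not part of the library):

#include <stddef.h>

/* Argmin over `height` rows of one column; `stride` is in elements. */
static unsigned argmin_column(const float *col, size_t height, size_t stride)
{
    float    best_val = col[0];
    unsigned best_idx = 0;
    for(size_t y = 1; y < height; ++y)
    {
        const float in   = col[y * stride];
        const int   take = (in < best_val);       /* CONDITION_TO_USE for ARG_MIN */
        best_idx = take ? (unsigned)y : best_idx; /* select(indx0, y, cond)       */
        best_val = take ? in : best_val;          /* select(res, in, cond)        */
    }
    return best_idx;
}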
-#endif // defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE_OUTPUT)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/batch_to_space.cl b/src/core/CL/cl_kernels/batch_to_space.cl
deleted file mode 100644
index 8a71985b02..0000000000
--- a/src/core/CL/cl_kernels/batch_to_space.cl
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(BATCH_SIZE)
-/** Batch to space transformation. (NCHW)
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
- *
- * @param[in]  input_ptr                                  Pointer to the source tensor. Supported data types: All
- * @param[in]  input_stride_x                             Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                               input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                             Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                               input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                             Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                               input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes        The offset of the first element in the first source tensor
- * @param[in]  batch_id                                   The input tensor batch id
- * @param[in]  block_shape_ptr                            Pointer to the source tensor. Supported data types: S32
- * @param[in]  block_shape_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  block_shape_step_x                         block_shape_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  block_shape_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  block_shape_step_y                         block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  block_shape_offset_first_element_in_bytes  The offset of the first element in the block shape tensor
- * @param[out] output_ptr                                 Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                            Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                              output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                            Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                              output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                            Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                              output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes       The offset of the first element in the destination tensor
- */
-__kernel void batch_to_space_nchw(
-    TENSOR3D_DECLARATION(input),
-    const int batch_id,
-    VECTOR_DECLARATION(block_shape),
-    TENSOR4D_DECLARATION(output))
-{
-    Tensor3D in    = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor4D out   = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-    Vector   block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
-
-    const int block_x = *((__global int *)vector_offset(&block, 0));
-    const int block_y = *((__global int *)vector_offset(&block, 1));
-
-    const int r = (BATCH_SIZE / (block_x * block_y));
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-    const int z = get_global_id(2);
-    const int w = batch_id % r;
-
-    const int out_x = x * block_x + (batch_id / r) % block_x;
-    const int out_y = y * block_y + (batch_id / r) / block_x;
-
-    *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, w)) = *((__global DATA_TYPE *)in.ptr);
-}
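For the index arithmetic above: with r = BATCH_SIZE / (block_x * block_y) output batches, each input batch decomposes into an output batch batch_id % r and a block cell s = batch_id / r, which selects the column s % block_x and the row s / block_x inside each block_x-by-block_y tile of the output. A standalone C sketch with a worked example (illustrative code, not part of the library):

#include <stdio.h>

/* Map an input-batch pixel (x, y) to its output coordinates. */
static void batch_to_space_coords(int batch_size, int block_x, int block_y,
                                  int batch_id, int x, int y,
                                  int *out_x, int *out_y, int *out_batch)
{
    const int r = batch_size / (block_x * block_y); /* output batch count */
    const int s = batch_id / r;                     /* block cell index   */
    *out_batch  = batch_id % r;
    *out_x      = x * block_x + s % block_x;        /* column inside tile */
    *out_y      = y * block_y + s / block_x;        /* row inside tile    */
}

int main(void)
{
    /* e.g. BATCH_SIZE=8, 2x2 block, input batch 5, pixel (3, 4) */
    int ox, oy, ob;
    batch_to_space_coords(8, 2, 2, 5, 3, 4, &ox, &oy, &ob);
    printf("out=(%d,%d) batch=%d\n", ox, oy, ob); /* prints out=(6,9) batch=1 */
    return 0;
}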
-/** Batch to space transformation. (NHWC)
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
- *
- * @param[in]  input_ptr                                  Pointer to the source tensor. Supported data types: All
- * @param[in]  input_stride_x                             Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                               input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                             Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                               input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                             Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                               input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes        The offset of the first element in the first source tensor
- * @param[in]  batch_id                                   The input tensor batch id
- * @param[in]  block_shape_ptr                            Pointer to the source tensor. Supported data types: S32
- * @param[in]  block_shape_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  block_shape_step_x                         block_shape_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  block_shape_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  block_shape_step_y                         block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  block_shape_offset_first_element_in_bytes  The offset of the first element in the block shape tensor
- * @param[out] output_ptr                                 Pointer to the destination tensor.
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void batch_to_space_nhwc( - TENSOR3D_DECLARATION(input), - const int batch_id, - VECTOR_DECLARATION(block_shape), - TENSOR4D_DECLARATION(output)) -{ - Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); - Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape); - - const int block_x = *((__global int *)vector_offset(&block, 0)); - const int block_y = *((__global int *)vector_offset(&block, 1)); - - const int r = (BATCH_SIZE / (block_x * block_y)); - const int x = get_global_id(1); - const int y = get_global_id(2); - const int z = get_global_id(0); - const int w = batch_id % r; - - const int out_x = x * block_x + (batch_id / r) % block_x; - const int out_y = y * block_y + (batch_id / r) / block_x; - - *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, w)) = *((__global DATA_TYPE *)in.ptr); -} -#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) - -#if defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) -/** Batch to space transformation. (NCHW) - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2 - * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2 - * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: All - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[in] batch_id The input tensor batch id - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void batch_to_space_static_nchw( - TENSOR3D_DECLARATION(input), - const int batch_id, - TENSOR4D_DECLARATION(output)) -{ - Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); - - const int block_x = BLOCK_SHAPE_X; - const int block_y = BLOCK_SHAPE_Y; - - const int r = (BATCH_SIZE / (block_x * block_y)); - const int x = get_global_id(0); - const int y = get_global_id(1); - const int z = get_global_id(2); - const int w = batch_id % r; - - const int out_x = x * block_x + (batch_id / r) % block_x; - const int out_y = y * block_y + (batch_id / r) / block_x; - - *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, w)) = *((__global DATA_TYPE *)in.ptr); -} -/** Batch to space transformation. (NHWC) - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2 - * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2 - * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: All - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[in] batch_id The input tensor batch id - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void batch_to_space_static_nhwc( - TENSOR3D_DECLARATION(input), - const int batch_id, - TENSOR4D_DECLARATION(output)) -{ - Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); - - const int block_x = BLOCK_SHAPE_X; - const int block_y = BLOCK_SHAPE_Y; - - const int r = (BATCH_SIZE / (block_x * block_y)); - const int x = get_global_id(1); - const int y = get_global_id(2); - const int z = get_global_id(0); - const int w = batch_id % r; - - const int out_x = x * block_x + (batch_id / r) % block_x; - const int out_y = y * block_y + (batch_id / r) / block_x; - - *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, w)) = *((__global DATA_TYPE *)in.ptr); -} -#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl deleted file mode 100644 index 89cbe4440e..0000000000 --- a/src/core/CL/cl_kernels/batchnormalization_layer.cl +++ /dev/null @@ -1,418 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#define ADD_OP(a, b) ((a) + (b)) -#define SUB_OP(a, b) ((a) - (b)) -#define MUL_OP(a, b) ((a) * (b)) -#define INVSQRT_OP(a) rsqrt((a)) -#define SQCVT_SAT(a) (a) - -#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE) -#include "activation_float_helpers.h" - -/** Apply batch normalization. - * - * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. 
-DACTIVATION_TYPE=relu - * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively - * - * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p input_ptr - * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) - * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor - * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p input_ptr - * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes) - * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor - * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p input_ptr - * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes) - * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor - * @param[in] gamma_ptr Pointer to the gamma source tensor. 
Supported data types: same as @p input_ptr - * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes) - * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor - * @param[in] epsilon Epsilon parameter in the batch normalization equation - */ -__kernel void batchnormalization_layer_nchw(TENSOR3D_DECLARATION(input), -#ifndef IN_PLACE - TENSOR3D_DECLARATION(output), -#endif /* not IN_PLACE */ - VECTOR_DECLARATION(mean), - VECTOR_DECLARATION(var), -#ifndef USE_DEFAULT_BETA - VECTOR_DECLARATION(beta), -#endif /* USE_DEFAULT_BETA */ -#ifndef USE_DEFAULT_GAMMA - VECTOR_DECLARATION(gamma), -#endif /* USE_DEFAULT_GAMMA */ - float epsilon) -{ - Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); -#ifdef IN_PLACE - Tensor3D out = in; -#else /* IN_PLACE */ - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); -#endif /* IN_PLACE */ - Vector mean = CONVERT_TO_VECTOR_STRUCT(mean); - Vector var = CONVERT_TO_VECTOR_STRUCT(var); -#ifndef USE_DEFAULT_BETA - Vector beta = CONVERT_TO_VECTOR_STRUCT(beta); -#endif /* USE_DEFAULT_BETA */ -#ifndef USE_DEFAULT_GAMMA - Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma); -#endif /* USE_DEFAULT_GAMMA */ - - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - data = 0; - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - denominator = 0; - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - numerator = 0; - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - x_bar = 0; - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - res = 0; - - const int current_slice = get_global_id(2); - - data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr); - denominator = *((__global DATA_TYPE *)(var.ptr + current_slice * var.stride_x)); - denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon)))); - - // Calculate x bar and store results - numerator = *((__global DATA_TYPE *)(mean.ptr + current_slice * mean.stride_x)); - numerator = SUB_OP(data, numerator); - x_bar = MUL_OP(numerator, denominator); - -#ifndef USE_DEFAULT_GAMMA - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * gamma.stride_x)); - - res = MUL_OP(gamma_vec, x_bar); -#else /* USE_DEFAULT_GAMMA */ - // gamma is equal to 1, no need to perform multiplications - res = x_bar; -#endif /* USE_DEFAULT_GAMMA */ - -#ifndef USE_DEFAULT_BETA - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - beta_vec = *((__global DATA_TYPE *)(beta.ptr + current_slice * beta.stride_x)); - // beta is not zero, hence we need to perform the addition - res = ADD_OP(res, beta_vec); -#endif /* USE_DEFAULT_BETA */ - - res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res, A_VAL, B_VAL); - - VSTORE(VEC_SIZE) - (res, 0, (__global DATA_TYPE *)out.ptr); -} - -/** Apply batch normalization on tensors with NHWC format. - * - * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu - * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively - * - * @param[in] input_ptr Pointer to the first source tensor. 
Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p input_ptr - * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) - * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor - * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p input_ptr - * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes) - * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor - * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p input_ptr - * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes) - * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor - * @param[in] gamma_ptr Pointer to the gamma source tensor. 
Supported data types: same as @p input_ptr
- * @param[in]  gamma_stride_x                       Stride of the gamma source tensor in X dimension (in bytes)
- * @param[in]  gamma_step_x                         gamma_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  gamma_offset_first_element_in_bytes  The offset of the first element in the gamma source tensor
- * @param[in]  epsilon                              Epsilon parameter in the batch normalization equation
- */
-__kernel void batchnormalization_layer_nhwc(TENSOR3D_DECLARATION(input),
-#ifndef IN_PLACE
-                                            TENSOR3D_DECLARATION(output),
-#endif /* not IN_PLACE */
-                                            VECTOR_DECLARATION(mean),
-                                            VECTOR_DECLARATION(var),
-#ifndef USE_DEFAULT_BETA
-                                            VECTOR_DECLARATION(beta),
-#endif /* USE_DEFAULT_BETA */
-#ifndef USE_DEFAULT_GAMMA
-                                            VECTOR_DECLARATION(gamma),
-#endif /* USE_DEFAULT_GAMMA */
-                                            float epsilon)
-{
-    uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
-
-    __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
-#ifdef IN_PLACE
-    __global uchar *output_addr = input_addr;
-#else  /* IN_PLACE */
-    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
-#endif /* IN_PLACE */
-    __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs;
-    __global uchar *var_addr  = var_ptr + var_offset_first_element_in_bytes + x_offs;
-#ifndef USE_DEFAULT_BETA
-    __global uchar *beta_addr = beta_ptr + beta_offset_first_element_in_bytes + x_offs;
-#endif /* USE_DEFAULT_BETA */
-#ifndef USE_DEFAULT_GAMMA
-    __global uchar *gamma_addr = gamma_ptr + gamma_offset_first_element_in_bytes + x_offs;
-#endif /* USE_DEFAULT_GAMMA */
-
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    data = 0;
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    denominator = 0;
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    numerator = 0;
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    x_bar = 0;
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    res0 = 0;
-
-    data        = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
-    denominator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)var_addr);
-    denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
-
-    // Calculate x bar and store results
-    numerator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr);
-    numerator = SUB_OP(data, numerator);
-    x_bar     = MUL_OP(numerator, denominator);
-
-#ifndef USE_DEFAULT_GAMMA
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    gamma_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)gamma_addr);
-
-    res0 = MUL_OP(gamma_vec, x_bar);
-#else  /* USE_DEFAULT_GAMMA */
-    // gamma is equal to 1, no need to perform multiplications
-    res0 = x_bar;
-#endif /* USE_DEFAULT_GAMMA */
-
-#ifndef USE_DEFAULT_BETA
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    beta_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)beta_addr);
-    // beta is not zero, hence we need to perform the addition
-    res0 = ADD_OP(res0, beta_vec);
-#endif /* USE_DEFAULT_BETA */
-
-    res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res0, A_VAL, B_VAL);
-
-    STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
-#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE) */
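Both kernels above evaluate, per channel c, out = gamma_c * (x - mean_c) * rsqrt(var_c + epsilon) + beta_c, with gamma = 1 and beta = 0 when the USE_DEFAULT_* paths are taken. The fuse kernel documented next folds the same statistics into convolution weights, relying on convolution being linear: w' = gamma * w / sqrt(var + epsilon) and b' = gamma * (b - mean) / sqrt(var + epsilon) + beta. A scalar C reference of both forms (a sketch with illustrative names, not library code):

#include <math.h>

/* Per-element batch normalization, one channel's statistics. */
static float batchnorm(float x, float mean, float var,
                       float gamma, float beta, float eps)
{
    return gamma * (x - mean) / sqrtf(var + eps) + beta;
}

/* Folding: conv(x, w_fused) + b_fused == batchnorm(conv(x, w) + b). */
static void fuse_batchnorm(float w, float b, float mean, float var,
                           float gamma, float beta, float eps,
                           float *w_fused, float *b_fused)
{
    const float inv_std = 1.0f / sqrtf(var + eps);
    *w_fused = gamma * w * inv_std;
    *b_fused = gamma * (b - mean) * inv_std + beta;
}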
-
-#if defined(DATA_TYPE) && defined(EPSILON)
-/** OpenCL kernel to fuse the weights of convolution or depthwise convolution layer with batch normalization when the data layout is either NCHW or NHWC
- *
- * @note The input weights tensor is assumed 4D with the OFMs in the fourth dimension
- * @note Data type should be passed at compile time using the -DDATA_TYPE, e.g. -DDATA_TYPE=float
- * @note The third dimension of the input tensor should be passed at compile time when weights belong to a convolution layer using -DDIM2=size. e.g. -DDIM2=16.
- *       For depthwise convolution weights do not pass DIM2
- * @note Data layout NHWC should be passed at compile time with -DNHWC. For data layout NCHW it is not required to pass any parameter
- * @note Batch normalization epsilon parameter should be passed at compile time using -DEPSILON=value. e.g. -DEPSILON=0.001f
- *
- * @param[in]  w_ptr                                 Pointer to the weights tensor. Supported data types: F16/F32
- * @param[in]  w_stride_x                            Stride of the weights tensor in X dimension (in bytes)
- * @param[in]  w_step_x                              w_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  w_stride_y                            Stride of the weights tensor in Y dimension (in bytes)
- * @param[in]  w_step_y                              w_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  w_stride_z                            Stride of the weights tensor in Z dimension (in bytes)
- * @param[in]  w_step_z                              w_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  w_offset_first_element_in_bytes       The offset of the first element in the weights tensor
- * @param[in]  b_ptr                                 (Optional) Pointer to the bias tensor. Supported data types: same as @p w_ptr
- * @param[in]  b_stride_x                            (Optional) Stride of the bias tensor in X dimension (in bytes)
- * @param[in]  b_step_x                              (Optional) b_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  b_stride_y                            (Optional) Stride of the bias tensor in Y dimension (in bytes)
- * @param[in]  b_step_y                              (Optional) b_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  b_stride_z                            (Optional) Stride of the bias tensor in Z dimension (in bytes)
- * @param[in]  b_step_z                              (Optional) b_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  b_offset_first_element_in_bytes       (Optional) The offset of the first element in the bias tensor
- * @param[in]  mean_ptr                              Pointer to the mean source tensor. Supported data types: same as @p w_ptr
- * @param[in]  mean_stride_x                         Stride of the mean source tensor in X dimension (in bytes)
- * @param[in]  mean_step_x                           mean_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  mean_offset_first_element_in_bytes    The offset of the first element in the mean source tensor
- * @param[in]  var_ptr                               Pointer to the var tensor. Supported data types: same as @p w_ptr
- * @param[in]  var_stride_x                          Stride of the var tensor in X dimension (in bytes)
- * @param[in]  var_step_x                            var_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  var_offset_first_element_in_bytes     The offset of the first element in the var source tensor
- * @param[out] w_fused_ptr                           (Optional) Pointer to the destination weights tensor.
Supported data types: same as @p w_ptr - * @param[in] w_fused_stride_x (Optional) Stride of the destination weights tensor in X dimension (in bytes) - * @param[in] w_fused_step_x (Optional) w_fused_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] w_fused_stride_y (Optional) Stride of the destination weights tensor in Y dimension (in bytes) - * @param[in] w_fused_step_y (Optional) w_fused_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] w_fused_stride_z (Optional) Stride of the destination weights tensor in Z dimension (in bytes) - * @param[in] w_fused_step_z (Optional) w_fused_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] w_fused_offset_first_element_in_bytes (Optional) The offset of the first element in the destination weights tensor - * @param[in] b_fused_ptr (Optional) Pointer to the destination bias tensor. Supported data types: same as @p w_ptr - * @param[in] b_fused_stride_x (Optional) Stride of the destination bias tensor in X dimension (in bytes) - * @param[in] b_fused_step_x (Optional) b_fused_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] b_fused_offset_first_element_in_bytes (Optional) The offset of the first element in the destination bias tensor - * @param[in] beta_ptr (Optional) Pointer to the beta source tensor. Supported data types: same as @p w_ptr - * @param[in] beta_stride_x (Optional) Stride of the beta source tensor in X dimension (in bytes) - * @param[in] beta_step_x (Optional) beta_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] beta_offset_first_element_in_bytes (Optional) The offset of the first element in the beta source tensor - * @param[in] gamma_ptr (Optional) Pointer to the gamma source tensor. Supported data types: same as @p w_ptr - * @param[in] gamma_stride_x (Optional) Stride of the gamma source tensor in X dimension (in bytes) - * @param[in] gamma_step_x (Optional) gamma_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] gamma_offset_first_element_in_bytes (Optional) The offset of the first element in the gamma source tensor - */ -__kernel void fuse_batchnormalization_layer(TENSOR3D_DECLARATION(w), -#if defined(BIAS) - VECTOR_DECLARATION(b), -#endif // defined(BIAS) - VECTOR_DECLARATION(mean), - VECTOR_DECLARATION(var) -#ifndef IN_PLACE_W - , - TENSOR3D_DECLARATION(w_fused) -#endif // ifndef IN_PLACE_W -#ifndef IN_PLACE_B - , - VECTOR_DECLARATION(b_fused) -#endif // ifndef IN_PLACE_B -#if defined(BETA) - , - VECTOR_DECLARATION(beta) -#endif // defined(BETA) -#if defined(GAMMA) - , - VECTOR_DECLARATION(gamma) -#endif // defined(GAMMA) - ) -{ - int x = get_global_id(0); - int y = get_global_id(1); - int z = get_global_id(2); - -#if defined(DIM2) - int c0 = z % DIM2; - int c1 = z / DIM2; -#else // ! 
defined(DIM2) - int c0 = 0; -#if defined(NHWC) - int c1 = x; -#else // defined(NHWC) - int c1 = z; -#endif // defined(NHWC) -#endif // defined(DIM2) - - int w_offset = x * sizeof(DATA_TYPE) + y * w_stride_y + z * w_stride_z; - int v_offset = c1 * sizeof(DATA_TYPE); - - DATA_TYPE w_old = 0.0f; - DATA_TYPE b_old = 0.0f; - DATA_TYPE w_new = 0.0f; - DATA_TYPE b_new = 0.0f; - DATA_TYPE gamma = 1.0f; - DATA_TYPE mean = 0.0f; - DATA_TYPE var = 1.0f; - DATA_TYPE beta = 0.0f; - - w_old = *((__global DATA_TYPE *)(w_ptr + w_offset + w_offset_first_element_in_bytes)); - var = *((__global DATA_TYPE *)(var_ptr + v_offset + var_offset_first_element_in_bytes)); - mean = *((__global DATA_TYPE *)(mean_ptr + v_offset + mean_offset_first_element_in_bytes)); - -#if defined(GAMMA) - gamma = *((__global DATA_TYPE *)(gamma_ptr + v_offset + gamma_offset_first_element_in_bytes)); -#endif // defined(GAMMA) - - // Compute new weight - w_new = (gamma * w_old) / (sqrt(var + EPSILON)); - -#if defined(IN_PLACE_W) - *((__global DATA_TYPE *)(w_ptr + w_offset + w_offset_first_element_in_bytes)) = w_new; -#else // defined(IN_PLACE_W) - *((__global DATA_TYPE *)(w_fused_ptr + w_offset + w_fused_offset_first_element_in_bytes)) = w_new; -#endif // defined(IN_PLACE_W) - - // Compute bias -#if !defined(DIM2) && defined(NHWC) - if(z == 0 && y == 0) -#else // !defined(DIM2) && defined(NHWC) - if(x == 0 && y == 0 && c0 == 0) -#endif // !defined(DIM2) && defined(NHWC) - { -#if defined(BIAS) - b_old = *((__global DATA_TYPE *)(b_ptr + v_offset + b_offset_first_element_in_bytes)); -#endif // defined(BIAS) -#if defined(BETA) - beta = *((__global DATA_TYPE *)(beta_ptr + v_offset + beta_offset_first_element_in_bytes)); -#endif // defined(BETA) - - b_new = ((gamma * (b_old - mean)) / (sqrt(var + EPSILON))) + beta; - -#if defined(BIAS) - -#if defined(IN_PLACE_B) - *((__global DATA_TYPE *)(b_ptr + v_offset + b_offset_first_element_in_bytes)) = b_new; -#else // defined(IN_PLACE_B) - *((__global DATA_TYPE *)(b_fused_ptr + v_offset + b_fused_offset_first_element_in_bytes)) = b_new; -#endif // defined(IN_PLACE_B) - -#else // defined(BIAS) - -#ifndef IN_PLACE_B - *((__global DATA_TYPE *)(b_fused_ptr + v_offset + b_fused_offset_first_element_in_bytes)) = b_new; -#endif // ifndef IN_PLACE_B - -#endif // defined(BIAS) - } -} -#endif // defined(DATA_TYPE) && defined(EPSILON) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/bitwise_op.cl b/src/core/CL/cl_kernels/bitwise_op.cl deleted file mode 100644 index a600bced9e..0000000000 --- a/src/core/CL/cl_kernels/bitwise_op.cl +++ /dev/null @@ -1,218 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) - -/** This function computes the bitwise OR of two input images. - * - * @note The following variables must be passed at compile time: - * -# -DVEC_SIZE : The number of elements processed in X dimension - * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE - * - * @param[in] in1_ptr Pointer to the source image. Supported data types: U8 - * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] in2_ptr Pointer to the source image. Supported data types: U8 - * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: U8 - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void bitwise_or( - IMAGE_DECLARATION(in1), - IMAGE_DECLARATION(in2), - IMAGE_DECLARATION(out)) -{ - uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); - - // Get pixels pointer - __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x_offs + get_global_id(1) * in1_step_y; - __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x_offs + get_global_id(1) * in2_step_y; - __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x_offs + get_global_id(1) * out_step_y; - - // Load data - VEC_DATA_TYPE(uchar, VEC_SIZE) - in_a = VLOAD(VEC_SIZE)(0, (__global uchar *)in1_addr); - VEC_DATA_TYPE(uchar, VEC_SIZE) - in_b = VLOAD(VEC_SIZE)(0, (__global uchar *)in2_addr); - - VEC_DATA_TYPE(uchar, VEC_SIZE) - data0 = in_a | in_b; - - // Boundary-aware store - STORE_VECTOR_SELECT(data, uchar, (__global uchar *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); -} - -/** This function computes the bitwise AND of two input images. 
- * - * @note The following variables must be passed at compile time: - * -# -DVEC_SIZE : The number of elements processed in X dimension - * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE - * - * @param[in] in1_ptr Pointer to the source image. Supported data types: U8 - * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] in2_ptr Pointer to the source image. Supported data types: U8 - * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: U8 - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void bitwise_and( - IMAGE_DECLARATION(in1), - IMAGE_DECLARATION(in2), - IMAGE_DECLARATION(out)) -{ - uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); - - // Get pixels pointer - __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x_offs + get_global_id(1) * in1_step_y; - __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x_offs + get_global_id(1) * in2_step_y; - __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x_offs + get_global_id(1) * out_step_y; - - // Load data - VEC_DATA_TYPE(uchar, VEC_SIZE) - in_a = VLOAD(VEC_SIZE)(0, (__global uchar *)in1_addr); - VEC_DATA_TYPE(uchar, VEC_SIZE) - in_b = VLOAD(VEC_SIZE)(0, (__global uchar *)in2_addr); - - VEC_DATA_TYPE(uchar, VEC_SIZE) - data0 = in_a & in_b; - - // Boundary-aware store - STORE_VECTOR_SELECT(data, uchar, (__global uchar *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); -} - -/** This function computes the bitwise XOR of two input images. - * - * @note The following variables must be passed at compile time: - * -# -DVEC_SIZE : The number of elements processed in X dimension - * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE - * - * @param[in] in1_ptr Pointer to the source image. 
Supported data types: U8
- * @param[in]  in1_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  in1_step_x                        in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in1_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  in1_step_y                        in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in1_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in]  in2_ptr                           Pointer to the source image. Supported data types: U8
- * @param[in]  in2_stride_x                      Stride of the source image in X dimension (in bytes)
- * @param[in]  in2_step_x                        in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in2_stride_y                      Stride of the source image in Y dimension (in bytes)
- * @param[in]  in2_step_y                        in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in2_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void bitwise_xor(
-    IMAGE_DECLARATION(in1),
-    IMAGE_DECLARATION(in2),
-    IMAGE_DECLARATION(out))
-{
-    uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
-
-    // Get pixels pointer
-    __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x_offs + get_global_id(1) * in1_step_y;
-    __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x_offs + get_global_id(1) * in2_step_y;
-    __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x_offs + get_global_id(1) * out_step_y;
-
-    // Load data
-    VEC_DATA_TYPE(uchar, VEC_SIZE)
-    in_a = VLOAD(VEC_SIZE)(0, (__global uchar *)in1_addr);
-    VEC_DATA_TYPE(uchar, VEC_SIZE)
-    in_b = VLOAD(VEC_SIZE)(0, (__global uchar *)in2_addr);
-
-    VEC_DATA_TYPE(uchar, VEC_SIZE)
-    data0 = in_a ^ in_b;
-
-    // Boundary-aware store
-    STORE_VECTOR_SELECT(data, uchar, (__global uchar *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
-}
-
-/** This function computes the bitwise NOT of an image.
- *
- * @note The following variables must be passed at compile time:
- * -# -DVEC_SIZE : The number of elements processed in X dimension
- * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE
- *
- * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8
- * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image.
Supported data types: U8 - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void bitwise_not( - IMAGE_DECLARATION(in1), - IMAGE_DECLARATION(out)) -{ - uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); - - // Get pixels pointer - __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x_offs + get_global_id(1) * in1_step_y; - __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x_offs + get_global_id(1) * out_step_y; - - // Load data - VEC_DATA_TYPE(uchar, VEC_SIZE) - in_a = VLOAD(VEC_SIZE)(0, (__global uchar *)in1_addr); - - VEC_DATA_TYPE(uchar, VEC_SIZE) - data0 = ~in_a; - - // Boundary-aware store - STORE_VECTOR_SELECT(data, uchar, (__global uchar *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); -} - -#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/bounding_box_transform.cl b/src/core/CL/cl_kernels/bounding_box_transform.cl deleted file mode 100644 index f2e9cb0ed0..0000000000 --- a/src/core/CL/cl_kernels/bounding_box_transform.cl +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(WEIGHT_X) && defined(WEIGHT_Y) && defined(WEIGHT_W) && defined(WEIGHT_H) && defined(IMG_WIDTH) && defined(IMG_HEIGHT) && defined(BOX_FIELDS) && defined(SCALE_BEFORE) // Check for compile time constants - -/** Transform proposal bounding boxes to target bounding box using bounding box deltas. - * - * @attention The following variables must be passed at compile time: - * -# -DDATA_TYPE= Tensor data type. 
- *       -# -DDATA_TYPE= Tensor data type. Supported data types: F16/F32
- *       -# -DWEIGHT{X,Y,W,H}= Weights [wx, wy, ww, wh] for the deltas
- *       -# -DIMG_WIDTH= Original image width
- *       -# -DIMG_HEIGHT= Original image height
- *       -# -DBOX_FIELDS= Number of fields that are used to represent a box in boxes
- *
- * @param[in]  boxes_ptr                                Pointer to the boxes tensor. Supported data types: F16/F32
- * @param[in]  boxes_stride_x                           Stride of the boxes tensor in X dimension (in bytes)
- * @param[in]  boxes_step_x                             boxes_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  boxes_stride_y                           Stride of the boxes tensor in Y dimension (in bytes)
- * @param[in]  boxes_step_y                             boxes_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  boxes_stride_z                           Stride of the boxes tensor in Z dimension (in bytes)
- * @param[in]  boxes_step_z                             boxes_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  boxes_offset_first_element_in_bytes      The offset of the first element in the boxes tensor
- * @param[out] pred_boxes_ptr                           Pointer to the predicted boxes. Supported data types: same as @p boxes_ptr
- * @param[in]  pred_boxes_stride_x                      Stride of the predicted boxes in X dimension (in bytes)
- * @param[in]  pred_boxes_step_x                        pred_boxes_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  pred_boxes_stride_y                      Stride of the predicted boxes in Y dimension (in bytes)
- * @param[in]  pred_boxes_step_y                        pred_boxes_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  pred_boxes_stride_z                      Stride of the predicted boxes in Z dimension (in bytes)
- * @param[in]  pred_boxes_step_z                        pred_boxes_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  pred_boxes_offset_first_element_in_bytes The offset of the first element in the predicted boxes
- * @param[in]  deltas_ptr                               Pointer to the deltas tensor. Supported data types: same as @p boxes_ptr
- * @param[in]  deltas_stride_x                          Stride of the deltas tensor in X dimension (in bytes)
- * @param[in]  deltas_step_x                            deltas_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  deltas_stride_y                          Stride of the deltas tensor in Y dimension (in bytes)
- * @param[in]  deltas_step_y                            deltas_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  deltas_stride_z                          Stride of the deltas tensor in Z dimension (in bytes)
- * @param[in]  deltas_step_z                            deltas_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  deltas_offset_first_element_in_bytes     The offset of the first element in the deltas tensor
- */
-__kernel void bounding_box_transform(
-    VECTOR_DECLARATION(boxes),
-    IMAGE_DECLARATION(pred_boxes),
-    IMAGE_DECLARATION(deltas))
-{
-    // Get pixels pointer
-    Vector boxes = CONVERT_TO_VECTOR_STRUCT_NO_STEP(boxes);
-    Image pred_boxes = CONVERT_TO_IMAGE_STRUCT(pred_boxes);
-    Image deltas = CONVERT_TO_IMAGE_STRUCT(deltas);
-
-    // Load delta and box values into registers
-    const DATA_TYPE one = (DATA_TYPE)1.f;
-    const DATA_TYPE halfone = (DATA_TYPE)0.5f;
-
-    const int py = get_global_id(1); // box
-    const VEC_DATA_TYPE(DATA_TYPE, 4)
-    scale_before = (VEC_DATA_TYPE(DATA_TYPE, 4))SCALE_BEFORE;
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    delta = vload4(0, (__global DATA_TYPE *)deltas.ptr);
-    const VEC_DATA_TYPE(DATA_TYPE, 4)
-    box = vload4(0, (__global DATA_TYPE *)vector_offset(&boxes, BOX_FIELDS * py)) / scale_before;
-
-    // Calculate widths and centers of the old boxes
-    const VEC_DATA_TYPE(DATA_TYPE, 2)
-    dims = box.s23 - box.s01 + one;
-    const VEC_DATA_TYPE(DATA_TYPE, 2)
-    ctr = box.s01 + halfone * dims;
-    const VEC_DATA_TYPE(DATA_TYPE, 4)
-    weights = (VEC_DATA_TYPE(DATA_TYPE, 4))(WEIGHT_X, WEIGHT_Y, WEIGHT_W, WEIGHT_H);
-    delta /= weights;
-    delta.s23 = min(delta.s23, (DATA_TYPE)BBOX_XFORM_CLIP);
-
-    // Calculate widths and centers of the new boxes (translation + aspect ratio transformation)
-    const VEC_DATA_TYPE(DATA_TYPE, 2)
-    pred_ctr = delta.s01 * dims + ctr;
-    const VEC_DATA_TYPE(DATA_TYPE, 2)
-    pred_dims = exp(delta.s23) * dims;
-
-    // Useful vector constant definitions
-    const VEC_DATA_TYPE(DATA_TYPE, 4)
-    max_values = (VEC_DATA_TYPE(DATA_TYPE, 4))(IMG_WIDTH - 1, IMG_HEIGHT - 1, IMG_WIDTH - 1, IMG_HEIGHT - 1);
-    const VEC_DATA_TYPE(DATA_TYPE, 4)
-    sign = (VEC_DATA_TYPE(DATA_TYPE, 4))(-1, -1, 1, 1);
-    const VEC_DATA_TYPE(DATA_TYPE, 4)
-    min_values = 0;
-
-    // Calculate the coordinates of the new boxes
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    pred_box = pred_ctr.s0101 + sign * halfone * pred_dims.s0101;
-#ifdef OFFSET // Possibly adjust the predicted boxes
-    pred_box.s23 -= one;
-#endif // Possibly adjust the predicted boxes
-    pred_box = CLAMP(pred_box, min_values, max_values);
-#ifdef SCALE_AFTER // Possibly scale the predicted boxes
-    pred_box *= (VEC_DATA_TYPE(DATA_TYPE, 4))SCALE_AFTER;
-#endif // Possibly scale the predicted boxes
-
-    // Store them into the output
-    vstore4(pred_box, 0, (__global DATA_TYPE *)pred_boxes.ptr);
-}
-
-#endif // defined(DATA_TYPE) && defined(WEIGHT_X) && defined(WEIGHT_Y) && defined(WEIGHT_W) && defined(WEIGHT_H) && defined(IMG_WIDTH) && defined(IMG_HEIGHT) && defined(BOX_FIELDS) && defined(SCALE_BEFORE)
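For reference, the decode implemented by this kernel is the standard Faster R-CNN box transform. A minimal host-side C++ sketch of the same arithmetic (illustrative only: decode_box is not a library symbol, and the OFFSET/SCALE_AFTER variants are omitted):

    #include <algorithm>
    #include <cmath>

    // Host-side equivalent of bounding_box_transform (float case).
    // box/delta layout: {x1, y1, x2, y2} / {dx, dy, dw, dh}.
    void decode_box(const float box[4], const float delta[4],
                    const float weights[4], // {wx, wy, ww, wh}
                    float img_w, float img_h, float scale_before,
                    float bbox_xform_clip, float pred[4])
    {
        const float x1 = box[0] / scale_before, y1 = box[1] / scale_before;
        const float x2 = box[2] / scale_before, y2 = box[3] / scale_before;

        // Widths/heights and centres of the old box (the kernel's dims/ctr)
        const float w = x2 - x1 + 1.f, h = y2 - y1 + 1.f;
        const float cx = x1 + 0.5f * w, cy = y1 + 0.5f * h;

        // Apply weighted deltas; dw/dh are clipped before exp() to avoid overflow
        const float dx = delta[0] / weights[0], dy = delta[1] / weights[1];
        const float dw = std::min(delta[2] / weights[2], bbox_xform_clip);
        const float dh = std::min(delta[3] / weights[3], bbox_xform_clip);

        const float pcx = dx * w + cx, pcy = dy * h + cy;
        const float pw = std::exp(dw) * w, ph = std::exp(dh) * h;

        // Corners, clamped to the image (the kernel's CLAMP(min_values, max_values))
        pred[0] = std::clamp(pcx - 0.5f * pw, 0.f, img_w - 1.f);
        pred[1] = std::clamp(pcy - 0.5f * ph, 0.f, img_h - 1.f);
        pred[2] = std::clamp(pcx + 0.5f * pw, 0.f, img_w - 1.f);
        pred[3] = std::clamp(pcy + 0.5f * ph, 0.f, img_h - 1.f);
    }

The quantized variant below performs exactly the same computation, merely bracketed by DEQUANTIZE on load and QUANTIZE on store.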
diff --git a/src/core/CL/cl_kernels/bounding_box_transform_quantized.cl b/src/core/CL/cl_kernels/bounding_box_transform_quantized.cl
deleted file mode 100644
index c1d45a56b9..0000000000
--- a/src/core/CL/cl_kernels/bounding_box_transform_quantized.cl
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) 2019-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers_asymm.h"
-
-#if defined(DATA_TYPE) && defined(DATA_TYPE_DELTAS) && defined(WEIGHT_X) && defined(WEIGHT_Y) && defined(WEIGHT_W) && defined(WEIGHT_H) && defined(IMG_WIDTH) && defined(IMG_HEIGHT) && defined(BOX_FIELDS) && defined(SCALE_BEFORE) && defined(OFFSET_BOXES) && defined(SCALE_BOXES) && defined(OFFSET_DELTAS) && defined(SCALE_DELTAS) && defined(OFFSET_PRED_BOXES) && defined(SCALE_PRED_BOXES) // Check for compile time constants
-
-/** Transform proposal bounding boxes to target bounding box using bounding box deltas for quantized data types.
- *
- * @attention The following variables must be passed at compile time:
- *       -# -DDATA_TYPE= Tensor data type. Supported data types: QASYMM16 for boxes and pred_boxes, QASYMM8 for deltas
- *       -# -DWEIGHT{X,Y,W,H}= Weights [wx, wy, ww, wh] for the deltas
- *       -# -DIMG_WIDTH= Original image width
- *       -# -DIMG_HEIGHT= Original image height
- *       -# -DBOX_FIELDS= Number of fields that are used to represent a box in boxes
- *
- * @param[in]  boxes_ptr                                Pointer to the boxes tensor. Supported data types: QASYMM16
- * @param[in]  boxes_stride_x                           Stride of the boxes tensor in X dimension (in bytes)
- * @param[in]  boxes_step_x                             boxes_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  boxes_stride_y                           Stride of the boxes tensor in Y dimension (in bytes)
- * @param[in]  boxes_step_y                             boxes_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  boxes_stride_z                           Stride of the boxes tensor in Z dimension (in bytes)
- * @param[in]  boxes_step_z                             boxes_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  boxes_offset_first_element_in_bytes      The offset of the first element in the boxes tensor
- * @param[out] pred_boxes_ptr                           Pointer to the predicted boxes. Supported data types: same as @p boxes_ptr
- * @param[in]  pred_boxes_stride_x                      Stride of the predicted boxes in X dimension (in bytes)
- * @param[in]  pred_boxes_step_x                        pred_boxes_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  pred_boxes_stride_y                      Stride of the predicted boxes in Y dimension (in bytes)
- * @param[in]  pred_boxes_step_y                        pred_boxes_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  pred_boxes_stride_z                      Stride of the predicted boxes in Z dimension (in bytes)
- * @param[in]  pred_boxes_step_z                        pred_boxes_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  pred_boxes_offset_first_element_in_bytes The offset of the first element in the predicted boxes
- * @param[in]  deltas_ptr                               Pointer to the deltas tensor. Supported data types: QASYMM8
- * @param[in]  deltas_stride_x                          Stride of the deltas tensor in X dimension (in bytes)
- * @param[in]  deltas_step_x                            deltas_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  deltas_stride_y                          Stride of the deltas tensor in Y dimension (in bytes)
- * @param[in]  deltas_step_y                            deltas_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  deltas_stride_z                          Stride of the deltas tensor in Z dimension (in bytes)
- * @param[in]  deltas_step_z                            deltas_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  deltas_offset_first_element_in_bytes     The offset of the first element in the deltas tensor
- */
-__kernel void bounding_box_transform_quantized(
-    VECTOR_DECLARATION(boxes),
-    IMAGE_DECLARATION(pred_boxes),
-    IMAGE_DECLARATION(deltas))
-{
-    // Get pixels pointer
-    Vector boxes = CONVERT_TO_VECTOR_STRUCT_NO_STEP(boxes);
-    Image pred_boxes = CONVERT_TO_IMAGE_STRUCT(pred_boxes);
-    Image deltas = CONVERT_TO_IMAGE_STRUCT(deltas);
-
-    // Load delta and box values into registers
-    const float one = 1.f;
-    const float halfone = 0.5f;
-
-    const int py = get_global_id(1); // box
-    float4 scale_before = (float4)SCALE_BEFORE;
-    float4 delta = DEQUANTIZE(vload4(0, (__global DATA_TYPE_DELTAS *)deltas.ptr), OFFSET_DELTAS, SCALE_DELTAS, DATA_TYPE_DELTAS, 4);
-    float4 box = DEQUANTIZE(vload4(0, (__global DATA_TYPE *)vector_offset(&boxes, BOX_FIELDS * py)), OFFSET_BOXES, SCALE_BOXES, DATA_TYPE, 4) / scale_before;
-
-    // Calculate widths and centers of the old boxes
-    float2 dims = box.s23 - box.s01 + one;
-    float2 ctr = box.s01 + halfone * dims;
-    float4 weights = (float4)(WEIGHT_X, WEIGHT_Y, WEIGHT_W, WEIGHT_H);
-    delta /= weights;
-    delta.s23 = min(delta.s23, (float)BBOX_XFORM_CLIP);
-
-    // Calculate widths and centers of the new boxes (translation + aspect ratio transformation)
-    float2 pred_ctr = delta.s01 * dims + ctr;
-    float2 pred_dims = exp(delta.s23) * dims;
-
-    // Useful vector constant definitions
-    float4 max_values = (float4)(IMG_WIDTH - 1, IMG_HEIGHT - 1, IMG_WIDTH - 1, IMG_HEIGHT - 1);
-    float4 sign = (float4)(-1, -1, 1, 1);
-    float4 min_values = 0;
-
-    // Calculate the coordinates of the new boxes
-    float4 pred_box = pred_ctr.s0101 + sign * halfone * pred_dims.s0101;
-#ifdef OFFSET // Possibly adjust the predicted boxes
-    pred_box.s23 -= one;
-#endif // Possibly adjust the predicted boxes
-    pred_box = CLAMP(pred_box, min_values, max_values);
-#ifdef SCALE_AFTER // Possibly scale the predicted boxes
-    pred_box *= (float4)SCALE_AFTER;
-#endif // Possibly scale the predicted boxes
-
-    // Store them into the output
-    vstore4(QUANTIZE(pred_box, OFFSET_PRED_BOXES, SCALE_PRED_BOXES, DATA_TYPE, 4), 0, (__global DATA_TYPE *)pred_boxes.ptr);
-}
-#endif // Check for compile time constants
diff --git a/src/core/CL/cl_kernels/cast.cl b/src/core/CL/cl_kernels/cast.cl
deleted file mode 100644
index 036a683ec7..0000000000
--- a/src/core/CL/cl_kernels/cast.cl
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Copyright (c) 2016-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#ifdef SATURATE
-#define CONVERT_DOWN(x, type) CONVERT_SAT(x, type)
-#else /* SATURATE */
-#define CONVERT_DOWN(x, type) CONVERT(x, type)
-#endif /* SATURATE */
-
-#define CONVERT_UP(x, type) CONVERT(x, type)
-
-/** This function performs a down-casting
- *
- * @attention For QSYMM8_PER_CHANNEL -> QASYMM8, it is the user's responsibility to keep track of the quantization info.
- *
- * @note The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
- *
- * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/F16/F32
- * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  in_step_z                         in_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void cast_down(
-    TENSOR3D_DECLARATION(in),
-    TENSOR3D_DECLARATION(out))
-{
-    int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
-
-    __global uchar *in_addr  = in_ptr + in_offset_first_element_in_bytes + sizeof(DATA_TYPE_IN) * x_offs + get_global_id(1) * in_stride_y + get_global_id(2) * in_stride_z;
-    __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + sizeof(DATA_TYPE_OUT) * x_offs + get_global_id(1) * out_stride_y + get_global_id(2) * out_stride_z;
-
-    // Load data
-    VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
-    in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)in_addr);
-
-#if defined(IS_DATA_TYPE_QUANTIZED)
-    in_data ^= (VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE))0x80;
-#endif // defined(IS_DATA_TYPE_QUANTIZED)
-
-#if defined(IS_DATA_TYPE_FLOAT)
-    VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
-    res0 = CONVERT_DOWN(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
-    STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-#else /* defined(IS_DATA_TYPE_FLOAT) */
-    VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
-    res0 = CONVERT_DOWN(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
-    STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-#endif /* defined(IS_DATA_TYPE_FLOAT) */
-}
-
-/** This function performs an up-casting
- *
- * @note The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
- *
- * @param[in]  in_ptr                            Pointer to the source image. Supported data types: U8/S8/U16/S16/U32/S32/F16/F32
- * @param[in]  in_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  in_step_x                         in_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  in_step_y                         in_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  in_step_z                         in_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] out_ptr                           Pointer to the destination image. Supported data types: U8/U16/S16/U32/S32/F16/F32
- * @param[in]  out_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  out_step_x                        out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  out_step_y                        out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  out_step_z                        out_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void cast_up(
-    TENSOR3D_DECLARATION(in),
-    TENSOR3D_DECLARATION(out))
-{
-    int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
-
-    __global uchar *in_addr  = in_ptr + in_offset_first_element_in_bytes + sizeof(DATA_TYPE_IN) * x_offs + get_global_id(1) * in_stride_y + get_global_id(2) * in_stride_z;
-    __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + sizeof(DATA_TYPE_OUT) * x_offs + get_global_id(1) * out_stride_y + get_global_id(2) * out_stride_z;
-
-    // Load data
-    VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
-    in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)in_addr);
-
-#if defined(IS_DATA_TYPE_FLOAT)
-    VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
-    res0 = CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
-    STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-#else /* defined(IS_DATA_TYPE_FLOAT) */
-    VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
-    res0 = CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
-    STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-#endif /* defined(IS_DATA_TYPE_FLOAT) */
-}
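Both cast kernels (and bitwise_not above) share the same boundary-aware addressing: work-item 0 stores only VEC_SIZE_LEFTOVER elements, and every later work-item is shifted back by (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE so its full-width store stays in bounds. A small standalone C++ sketch that prints the resulting coverage (example numbers only, not library code):

    #include <algorithm>
    #include <cstdio>

    int main()
    {
        const int width = 10, vec_size = 4;
        const int leftover = width % vec_size; // plays the role of -DVEC_SIZE_LEFTOVER
        const int n_items  = (width + vec_size - 1) / vec_size;
        for(int gid = 0; gid < n_items; ++gid)
        {
            // The kernels' x_offs expression: shift everything after item 0 back
            const int x_offs = std::max(gid * vec_size - (vec_size - leftover) % vec_size, 0);
            // STORE_VECTOR_SELECT writes only 'leftover' lanes for the first item
            const int stored = (leftover != 0 && gid == 0) ? leftover : vec_size;
            std::printf("gid %d writes [%d, %d)\n", gid, x_offs, x_offs + stored);
        }
        return 0;
    }

For width 10 and vector size 4 this prints the disjoint, fully covering ranges [0,2), [2,6) and [6,10).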
diff --git a/src/core/CL/cl_kernels/channel_shuffle.cl b/src/core/CL/cl_kernels/channel_shuffle.cl
deleted file mode 100644
index 63af2c6137..0000000000
--- a/src/core/CL/cl_kernels/channel_shuffle.cl
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "tile_helpers.h"
-
-#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
-
-// Check valid VEC_SIZES
-#if VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
-#error "Only vector sizes 1, 2, 3, 4, 8 and 16 are supported"
-#endif // VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
-
-#define DIV_MOD_UINT(x, y, div_res, mod_res)                    \
-    ({                                                          \
-        div_res = (uint)((x) * (float)(1.0f / (float)(y)));     \
-        uint r  = div_res * (y);                                \
-        mod_res = (x)-r;                                        \
-    })
-
-/** Performs channel shuffle when the data layout is NCHW. See https://arxiv.org/pdf/1707.01083.pdf for details.
- *
- * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
- * @note The depth of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
- * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
- * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
- *       K is equal to num_channels / num_groups.
- *
- * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: All
- * @param[in]  src_stride_x                      Stride of the first source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the first source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the first source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the first source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
- * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void channel_shuffle_nchw(TENSOR4D_DECLARATION(src),
-                                   TENSOR4D_DECLARATION(dst))
-{
-    uint curr_channel = 0; // channel id of input
-    uint batch_id     = 0; // batch id
-    uint group_id     = 0; // group id
-    uint channel_id   = 0; // channel id within the group
-
-    // Compute curr_channel and batch_id
-    DIV_MOD_UINT(get_global_id(2), SRC_DIM_Z, batch_id, curr_channel);
-
-    // Compute group_id and channel_id
-    DIV_MOD_UINT(curr_channel, K, group_id, channel_id);
-
-    const uint x = get_global_id(0) * VEC_SIZE;
-    const uint y = get_global_id(1) * 2;
-    const uint z = channel_id * NUM_GROUPS + group_id;
-
-    // Load the Nx2 block
-    const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + curr_channel * src_stride_z + batch_id * src_stride_w;
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    u0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    u1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
-
-    // Store blocks
-    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w;
-    VSTORE(VEC_SIZE)
-    (u0, 0, (__global DATA_TYPE *)(output_ptr + 0 * dst_stride_y));
-    VSTORE(VEC_SIZE)
-    (u1, 0, (__global DATA_TYPE *)(output_ptr + 1 * dst_stride_y));
-}
-
-#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_DIM_X)
-
-/** Performs channel shuffle when the data layout is NHWC. See https://arxiv.org/pdf/1707.01083.pdf for details.
- *
- * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
- * @note The third dimension of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
- * @note The first dimension of the tensor must be given as a preprocessor argument using -DSRC_DIM_X=num. e.g. -DSRC_DIM_X=64
- * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
- * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
- *       K is equal to num_channels / num_groups.
- * @note The leftover size in the X dimension should be given as a preprocessor argument using -DVEC_SIZE_LEFTOVER, i.e. x_dimension % VEC_SIZE. e.g. -DVEC_SIZE_LEFTOVER=1
- *
- * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: All
- * @param[in]  src_stride_x                      Stride of the first source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the first source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the first source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the first source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
- * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void channel_shuffle_nhwc(TENSOR4D_DECLARATION(src),
-                                   TENSOR4D_DECLARATION(dst))
-{
-    // Offset computation
-    const uint curr_out_channel = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER); // output feature map
-
-    uint z        = 0;
-    uint batch_id = 0;
-    // Compute curr_channel and batch_id
-    DIV_MOD_UINT(get_global_id(2), (uint)SRC_DIM_Z, batch_id, z);
-
-    VEC_DATA_TYPE(uint, VEC_SIZE)
-    curr_out_channels = (VEC_DATA_TYPE(uint, VEC_SIZE))(curr_out_channel) + VEC_OFFS(uint, VEC_SIZE);
-
-    VEC_DATA_TYPE(uint, VEC_SIZE)
-    in_channels = (curr_out_channels * (VEC_DATA_TYPE(uint, VEC_SIZE))(K)) % (VEC_DATA_TYPE(uint, VEC_SIZE))(SRC_DIM_X) + (curr_out_channels / (VEC_DATA_TYPE(uint, VEC_SIZE))(NUM_GROUPS));
-
-    // Load the values
-    const __global DATA_TYPE *input_ptr = (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + z * src_stride_z + batch_id * src_stride_w);
-
-#if VEC_SIZE == 1
-    DATA_TYPE out0 = *(input_ptr + in_channels);
-#elif VEC_SIZE == 2
-    VEC_DATA_TYPE(DATA_TYPE, 2)
-    out0 =
-    {
-        *(input_ptr + in_channels.s0),
-        *(input_ptr + in_channels.s1)
-    };
-#elif VEC_SIZE == 3
-    VEC_DATA_TYPE(DATA_TYPE, 3)
-    out0 =
-    {
-        *(input_ptr + in_channels.s0),
-        *(input_ptr + in_channels.s1),
-        *(input_ptr + in_channels.s2)
-    };
-#elif VEC_SIZE == 4
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    out0 =
-    {
-        *(input_ptr + in_channels.s0),
-        *(input_ptr + in_channels.s1),
-        *(input_ptr + in_channels.s2),
-        *(input_ptr + in_channels.s3)
-    };
-#elif VEC_SIZE == 8
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    out0 =
-    {
-        *(input_ptr + in_channels.s0),
-        *(input_ptr + in_channels.s1),
-        *(input_ptr + in_channels.s2),
-        *(input_ptr + in_channels.s3),
-        *(input_ptr + in_channels.s4),
-        *(input_ptr + in_channels.s5),
-        *(input_ptr + in_channels.s6),
-        *(input_ptr + in_channels.s7)
-    };
-#elif VEC_SIZE == 16
-    VEC_DATA_TYPE(DATA_TYPE, 16)
-    out0 =
-    {
-        *(input_ptr + in_channels.s0),
-        *(input_ptr + in_channels.s1),
-        *(input_ptr + in_channels.s2),
-        *(input_ptr + in_channels.s3),
-        *(input_ptr + in_channels.s4),
-        *(input_ptr + in_channels.s5),
-        *(input_ptr + in_channels.s6),
-        *(input_ptr + in_channels.s7),
-        *(input_ptr + in_channels.s8),
-        *(input_ptr + in_channels.s9),
-        *(input_ptr + in_channels.sa),
-        *(input_ptr + in_channels.sb),
-        *(input_ptr + in_channels.sc),
-        *(input_ptr + in_channels.sd),
-        *(input_ptr + in_channels.se),
-        *(input_ptr + in_channels.sf)
-    };
-#endif // VEC_SIZE == 1
-
-    __global uchar *output_ptr = dst_ptr + curr_out_channel * sizeof(DATA_TYPE) + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w;
-    STORE_VECTOR_SELECT(out, DATA_TYPE, output_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
-}
-#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_DIM_X)
-#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
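A note on the gather index used by the NHWC kernel above: with SRC_DIM_X = K * NUM_GROUPS, the expression (c * K) % SRC_DIM_X + c / NUM_GROUPS is the usual channel-shuffle permutation in disguise. A standalone C++ check of that identity (G, K and C are example values, not library symbols):

    #include <cassert>

    int main()
    {
        const int G = 3, K = 4, C = G * K; // -DNUM_GROUPS, -DK, -DSRC_DIM_X
        for(int c = 0; c < C; ++c)
        {
            const int kernel_idx   = (c * K) % C + c / G; // as computed in channel_shuffle_nhwc
            const int textbook_idx = (c % G) * K + c / G; // the factored form of the shuffle
            assert(kernel_idx == textbook_idx);           // holds because (c * K) % (K * G) == (c % G) * K
        }
        return 0;
    }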
diff --git a/src/core/CL/cl_kernels/col2im.cl b/src/core/CL/cl_kernels/col2im.cl
deleted file mode 100644
index 59c2d8a3aa..0000000000
--- a/src/core/CL/cl_kernels/col2im.cl
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT) && defined(NUM_GROUPS)
-
-#if ELEMENT_SIZE == 1
-#define COND_DATA_TYPE char
-#elif ELEMENT_SIZE == 2
-#define COND_DATA_TYPE short
-#elif ELEMENT_SIZE == 4
-#define COND_DATA_TYPE int
-#else // ELEMENT_SIZE
-#error "Element size not supported"
-#endif // ELEMENT_SIZE
-
-/** This kernel performs a reshaping of the output of the convolution layer
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The width of the input tensor must be passed at compile time using -DWIDTH_INPUT: e.g. -DWIDTH_INPUT=320
- * @note The width of the output tensor must be passed at compile time using -DWIDTH_OUTPUT: e.g. -DWIDTH_OUTPUT=600
- * @note The element size must be passed at compile time using -DELEMENT_SIZE: e.g. -DELEMENT_SIZE=4
- * @note The number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
- * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void col2im(
-    TENSOR3D_DECLARATION(src),
-    TENSOR4D_DECLARATION(dst))
-{
-    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-    Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(dst, 0);
-
-    const uint xd = get_global_id(1) % WIDTH_OUTPUT; // x coordinate of the destination tensor
-    const uint yd = get_global_id(1) / WIDTH_OUTPUT; // y coordinate of the destination tensor
-
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    data = vload8(0, (__global DATA_TYPE *)src.ptr);
-
-    uint x = get_global_id(0) * 8;
-    uint8 x_clamped = x + (uint8)(0, 1, 2, 3, 4, 5, 6, 7);
-
-    VEC_DATA_TYPE(COND_DATA_TYPE, 8)
-    cond0 = CONVERT((x_clamped < WIDTH_INPUT), VEC_DATA_TYPE(COND_DATA_TYPE, 8));
-
-    // Clamp x if out-of-bounds
-    x_clamped = select((uint8)x, x_clamped, convert_int8(cond0));
-
-    // If out-of-bound, overwrite with the first element
-    data = select((VEC_DATA_TYPE(DATA_TYPE, 8))data.s0, data, cond0);
-
-#if NUM_GROUPS > 1
-    // Compute output offset (batches on 4th dimension)
-    int idx = yd * dst_stride_y + xd * dst_stride_x + (get_global_id(2) / NUM_GROUPS) * dst.stride_w;
-
-    const uint group = get_global_id(2) % NUM_GROUPS; // group ID
-    x_clamped += group * WIDTH_INPUT;
-#else /* NUM_GROUPS == 1 */
-    // Compute output offset (batches on 3rd dimension)
-    int idx = yd * dst.stride_y + xd * dst.stride_x + get_global_id(2) * dst.stride_w;
-#endif /* NUM_GROUPS > 1 */
-
-    // Store value
-    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s0 * dst.stride_z)) = data.s0;
-    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s1 * dst.stride_z)) = data.s1;
-    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s2 * dst.stride_z)) = data.s2;
-    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s3 * dst.stride_z)) = data.s3;
-    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s4 * dst.stride_z)) = data.s4;
-    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s5 * dst.stride_z)) = data.s5;
-    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s6 * dst.stride_z)) = data.s6;
-    *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s7 * dst.stride_z)) = data.s7;
-}
-#endif // defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT) && defined(NUM_GROUPS)
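The tail handling in col2im is worth spelling out: lanes whose column index falls past WIDTH_INPUT have both their destination index and their value replaced by lane 0's, so the extra stores become harmless duplicates of a valid write instead of out-of-bounds accesses. A scalar C++ sketch of that select logic (illustrative values only):

    #include <cstdio>

    int main()
    {
        const int width_input = 6; // plays the role of -DWIDTH_INPUT
        const int x = 0;           // base index of one 8-wide block
        for(int lane = 0; lane < 8; ++lane)
        {
            const bool in_range = (x + lane) < width_input;         // the kernel's cond0
            const int  idx      = in_range ? x + lane : x;          // select() on x_clamped
            std::printf("lane %d -> plane %d (%s)\n", lane, idx,
                        in_range ? "own value" : "copy of lane 0"); // data = select(data.s0, data, cond0)
        }
        return 0;
    }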
diff --git a/src/core/CL/cl_kernels/common/activation_layer.cl b/src/core/CL/cl_kernels/common/activation_layer.cl
new file mode 100644
index 0000000000..a04556a1ed
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/activation_layer.cl
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(ACT) && defined(DATA_TYPE) && defined(VEC_SIZE)
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+#include "activation_float_helpers.h"
+
+/** This performs an activation function on floating point inputs.
+ *
+ * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
+ * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void activation_layer(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
+
+    // Get pixels pointer
+    __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
+#ifdef IN_PLACE
+    __global uchar *output_addr = input_addr;
+#else /* IN_PLACE */
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
+#endif /* IN_PLACE */
+
+    // Load data
+    TYPE data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
+
+    // Perform activation
+    data0 = ACTIVATION(ACT, DATA_TYPE, VEC_SIZE, data0, A_VAL, B_VAL);
+
+    // Store result
+    STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+
+#endif /* defined(ACT) && defined(DATA_TYPE) && defined(VEC_SIZE) */
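The ACT/A_VAL/B_VAL contract above comes from activation_float_helpers.h. As a rough host-side illustration of how a few of the selectable functions use those constants (the exact macro definitions should be taken from that header, so treat these formulas as an assumption of the sketch rather than a contract):

    #include <algorithm>
    #include <cmath>

    enum class Act { Logistic, Tanh, LuBoundedRelu };

    // a and b play the roles of -DA_VAL and -DB_VAL.
    float activation(Act act, float x, float a, float b)
    {
        switch(act)
        {
            case Act::Logistic:      return 1.f / (1.f + std::exp(-x));  // -DACT=LOGISTIC (a, b unused)
            case Act::Tanh:          return a * std::tanh(b * x);        // -DACT=TANH
            case Act::LuBoundedRelu: return std::min(a, std::max(b, x)); // -DACT=LU_BOUNDED_RELU
        }
        return x;
    }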
diff --git a/src/core/CL/cl_kernels/common/activation_layer_quant.cl b/src/core/CL/cl_kernels/common/activation_layer_quant.cl
new file mode 100644
index 0000000000..38ee00b17a
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/activation_layer_quant.cl
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_quant_helpers.h"
+
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+
+#if defined(FLOAT_DOMAIN)
+// Activations performed in the float domain
+
+#include "activation_float_helpers.h"
+
+/** This performs an activation function on quantized inputs with float transformations.
+ *
+ * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ * @note Quantization scales of the input/output tensors are passed in with -DS1_VAL= and -DS2_VAL= respectively.
+ * @note Quantization offsets of the input/output tensors are passed in only if asymmetric with -DO1_VAL= and -DO2_VAL= respectively.
+ * @note Quantized value of constant zero should be given as a preprocessor argument using -DCONST_0=value. e.g. -DCONST_0=128.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM16
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void activation_layer_quant_f32(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
+
+    // Get pixels pointer
+    __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
+#ifdef IN_PLACE
+    __global uchar *output_addr = input_addr;
+#else /* IN_PLACE */
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
+#endif /* IN_PLACE */
+
+    // Load data
+    TYPE data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
+
+    VEC_FLOAT data_flt = CONVERT(data0, VEC_FLOAT);
+#if defined(O1_VAL)
+    data_flt = round(data_flt - (float)O1_VAL) * ((float)S1_VAL);
+#else  // defined(O1_VAL)
+    data_flt = round(data_flt) * ((float)S1_VAL);
+#endif // defined(O1_VAL)
+    data_flt = ACTIVATION(ACT, float, VEC_SIZE, data_flt, A_VAL, B_VAL);
+
+#if defined(O2_VAL)
+    data0 = CONVERT_SAT(round(data_flt / ((float)S2_VAL)) + (float)O2_VAL, TYPE);
+#else  // defined(O2_VAL)
+    data0 = CONVERT_SAT(round(data_flt / ((float)S2_VAL)), TYPE);
+#endif // defined(O2_VAL)
+
+    // Store result
+    STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
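The float-domain path above boils down to dequantise, activate, requantise. A compact C++ sketch of that round trip for QASYMM8 with a logistic activation (activate_qasymm8 is illustrative, not a library function; s1/o1/s2/o2 play the roles of -DS1_VAL/-DO1_VAL/-DS2_VAL/-DO2_VAL):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    uint8_t activate_qasymm8(uint8_t q, float s1, int o1, float s2, int o2)
    {
        const float x = (static_cast<float>(q) - o1) * s1;      // dequantise (the kernel's round() is a no-op on integral q)
        const float y = 1.f / (1.f + std::exp(-x));             // ACTIVATION(ACT, ...)
        const float r = std::round(y / s2) + o2;                // requantise
        return static_cast<uint8_t>(std::clamp(r, 0.f, 255.f)); // CONVERT_SAT
    }

The quantized-domain variant that follows skips the float round trip entirely and applies the activation directly on the integer representation via PERFORM_ACTIVATION_QUANT.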
+#else // defined(FLOAT_DOMAIN)
+// Activations performed in the quantized domain
+
+#if defined(ACT)
+/** This performs an activation function on quantized inputs.
+ *
+ * @note In order to perform the activation function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE
+ * @note Activation function should be given as a preprocessor argument using -DACT=name. e.g. -DACT=TANH
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
+ * @note Quantization scales of the input/output tensors are passed in with -DS1_VAL= and -DS2_VAL= respectively.
+ * @note Quantization offsets of the input/output tensors are passed in with -DO1_VAL= and -DO2_VAL= respectively.
+ * @note Quantized value of constant zero should be given as a preprocessor argument using -DCONST_0=value. e.g. -DCONST_0=128.
+ *
+ * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM16
+ * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
+ * @param[out] output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ */
+__kernel void activation_layer_quant(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
+
+    // Get pixels pointer
+    __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
+#ifdef IN_PLACE
+    __global uchar *output_addr = input_addr;
+#else /* IN_PLACE */
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
+#endif /* IN_PLACE */
+
+    // Load data
+    TYPE data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
+
+    data0 = PERFORM_ACTIVATION_QUANT(ACT, data0);
+
+    // Store result
+    STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif // defined(ACT)
+#endif // defined(FLOAT_DOMAIN)
diff --git a/src/core/CL/cl_kernels/common/arg_min_max.cl b/src/core/CL/cl_kernels/common/arg_min_max.cl
new file mode 100644
index 0000000000..6e57ed0af1
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/arg_min_max.cl
@@ -0,0 +1,451 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE_OUTPUT)
+
+#define VEC_TYPE_IN VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define VEC_TYPE_OUT VEC_DATA_TYPE(DATA_TYPE_OUTPUT, VEC_SIZE)
+#define VEC_SELECT_IN SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define VEC_SIGNED_INT_IN SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+#if defined(FLOAT_DATA_TYPE)
+#define ISGREATER(x, y) (VEC_SELECT_IN) isgreater(x, y)
+#define ISLESS(x, y) (VEC_SELECT_IN) isless(x, y)
+#else // !FLOAT_DATA_TYPE
+#if defined(WIDTH)
+#define ISGREATER(x, y) (x > y) ? 1 : 0
+#define ISLESS(x, y) (x < y) ? 1 : 0
+#else // !defined(WIDTH)
+#define ISGREATER(x, y) select((VEC_SIGNED_INT_IN)0, (VEC_SIGNED_INT_IN)-1, (VEC_SIGNED_INT_IN)(x > y))
+#define ISLESS(x, y) select((VEC_SIGNED_INT_IN)0, (VEC_SIGNED_INT_IN)-1, (VEC_SIGNED_INT_IN)(x < y))
+#endif // defined(WIDTH)
+#endif // defined(FLOAT_DATA_TYPE)
+
+#if defined(ARG_MAX)
+#define CONDITION_TO_USE(x, y) ISGREATER(x, y)
+#elif defined(ARG_MIN)
+#define CONDITION_TO_USE(x, y) ISLESS(x, y)
+#else // !(defined(ARG_MAX) || defined(ARG_MIN))
+#error "Unsupported reduction operation!"
+#endif // defined(ARG_MAX)
+
+#if defined(WIDTH)
+#if defined(ARG_MIN)
+#if defined(PREV_OUTPUT)
+/** Find the index of the minimum value of a vector
+ *
+ * @param[in] input Pointer to the first value.
+ *
+ * @return Index of the minimum value.
+ */
+inline DATA_TYPE_OUTPUT arg_idx_min_prev_out(__global const DATA_TYPE *input, __global const DATA_TYPE_OUTPUT *prev_res, const int x_idx)
+{
+    int end_elem = (x_idx + 1) * 16;
+    if(end_elem > WIDTH)
+    {
+        end_elem = WIDTH - x_idx * 16;
+    }
+    DATA_TYPE_OUTPUT res = prev_res[0];
+    for(int x_v = 1; x_v < end_elem; ++x_v)
+    {
+        res = select(res, prev_res[x_v], *(input + prev_res[x_v]) < *(input + res));
+    }
+    return res;
+}
+#else // !defined(PREV_OUTPUT)
+/** Find the index of the minimum value of a vector
+ *
+ * @param[in] input Pointer to the first value.
+ *
+ * @return Index of the minimum value.
+ */
+inline DATA_TYPE_OUTPUT arg_idx_min(__global const DATA_TYPE *input, const int x_idx)
+{
+#if WIDTH < 16
+    DATA_TYPE_OUTPUT res = 0;
+    for(DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v)
+    {
+        res = select(res, x_v, *(input + x_v) < *(input + res));
+    }
+    return res;
+#else  // WIDTH >= 16
+    int       x_elem   = x_idx * 16;
+    const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH);
+    x_elem -= x_goback;
+
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    in = vload16(0, input - x_goback);
+    VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16)
+    res = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
+
+    SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, 8)
+    idx_sel       = (in.s01234567 <= in.s89abcdef);
+    in.s01234567  = select(in.s89abcdef, in.s01234567, idx_sel);
+    res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8));
+
+    idx_sel.s0123 = (in.s0123 < in.s4567) || (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, 4)));
+    in.s0123      = select(in.s4567, in.s0123, idx_sel.s0123);
+    res.s0123     = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4));
+
+    idx_sel.s01 = (in.s01 < in.s23) || (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, 2)));
+    in.s01      = select(in.s23, in.s01, idx_sel.s01);
+    res.s01     = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2));
+
+    idx_sel.s0 = (in.s0 < in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), SIGNED_INT_DATA_TYPE(DATA_TYPE)));
+    res.s0     = select(res.s1, res.s0, CONVERT(idx_sel.s0, int));
+
+    return res.s0 + x_elem;
+#endif // WIDTH < 16
+}
+#endif // defined(PREV_OUTPUT)
+#endif // defined(ARG_MIN)
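The vectorised folding above is easier to follow in scalar form: repeatedly compare the two halves of the candidate window, keeping the smaller value and, on ties, the lower index. A C++ rendition of the same idea (note the kernel's first 16-to-8 fold uses <= rather than an explicit index tie-break; a uniform tie-break is used here for clarity, and arg_idx_min_ref is an illustrative name):

    template <typename T>
    int arg_idx_min_ref(const T *in, int n) // n == 16 in the kernel
    {
        int idx[16];
        T   val[16];
        for(int i = 0; i < n; ++i) { idx[i] = i; val[i] = in[i]; }
        // Fold 16 -> 8 -> 4 -> 2 -> 1 candidates
        for(int half = n / 2; half > 0; half /= 2)
        {
            for(int i = 0; i < half; ++i)
            {
                const bool keep_lo = (val[i] < val[i + half]) ||
                                     (val[i] == val[i + half] && idx[i] < idx[i + half]);
                if(!keep_lo) { val[i] = val[i + half]; idx[i] = idx[i + half]; }
            }
        }
        return idx[0];
    }

The ARG_MAX variants below mirror this with the comparison direction flipped.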
#if defined(ARG_MAX) +#if defined(PREV_OUTPUT) +/** Find the index of the maximum value in a vector + * + * @param[in] input Pointer to the first value. + * @param[in] prev_res Pointer to the index results of the previous stage. + * @param[in] x_idx Index of the vector processed by this work-item. + * + * @return Index of the maximum element. + */ +inline DATA_TYPE_OUTPUT arg_idx_max_prev_out(__global const DATA_TYPE *input, __global const DATA_TYPE_OUTPUT *prev_res, const int x_idx) +{ + int end_elem = (x_idx + 1) * 16; + if(end_elem > WIDTH) + { + end_elem = WIDTH - x_idx * 16; + } + DATA_TYPE_OUTPUT res = prev_res[0]; + for(int x_v = 1; x_v < end_elem; ++x_v) + { + res = select(res, prev_res[x_v], *(input + prev_res[x_v]) > *(input + res)); + } + return res; +} +#else // !defined(PREV_OUTPUT) +/** Find the index of the maximum value in a vector + * + * @param[in] input Pointer to the first value. + * @param[in] x_idx Index of the vector processed by this work-item. + * + * @return Index of the maximum element. + */ +inline DATA_TYPE_OUTPUT arg_idx_max(__global const DATA_TYPE *input, const int x_idx) +{ +#if WIDTH < 16 + DATA_TYPE_OUTPUT res = 0; + for(DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v) + { + res = select(res, x_v, *(input + x_v) > *(input + res)); + } + return res; +#else // WIDTH >= 16 + int x_elem = x_idx * 16; + const int x_goback = select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH); + x_elem -= x_goback; + + VEC_DATA_TYPE(DATA_TYPE, 16) + in = vload16(0, input - x_goback); + VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) + res = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; + + SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, 8) + idx_sel = (in.s01234567 >= in.s89abcdef); + in.s01234567 = select(in.s89abcdef, in.s01234567, idx_sel); + res.s01234567 = select(res.s89abcdef, res.s01234567, CONVERT(idx_sel, int8)); + + idx_sel.s0123 = (in.s0123 > in.s4567) || (in.s0123 == in.s4567 && CONVERT((res.s0123 < res.s4567), SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, 4))); + in.s0123 = select(in.s4567, in.s0123, idx_sel.s0123); + res.s0123 = select(res.s4567, res.s0123, CONVERT(idx_sel.s0123, int4)); + + idx_sel.s01 = (in.s01 > in.s23) || (in.s01 == in.s23 && CONVERT((res.s01 < res.s23), SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, 2))); + in.s01 = select(in.s23, in.s01, idx_sel.s01); + res.s01 = select(res.s23, res.s01, CONVERT(idx_sel.s01, int2)); + + idx_sel.s0 = (in.s0 > in.s1) || (in.s0 == in.s1 && CONVERT((res.s0 < res.s1), SIGNED_INT_DATA_TYPE(DATA_TYPE))); + res.s0 = select(res.s1, res.s0, CONVERT(idx_sel.s0, int)); + + return res.s0 + x_elem; +#endif // WIDTH < 16 +} +#endif // defined(PREV_OUTPUT) +#endif // defined(ARG_MAX) + +/** This kernel performs a parallel reduction for a given operation along the x-axis. + * + * @note If the results of previous stages are passed in, the PREV_OUTPUT flag has to be set at compile time using -DPREV_OUTPUT + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g. -DDATA_TYPE_OUTPUT=uint + * @note The arg_max flag must be passed at compile time using -DARG_MAX to compute the ArgMax + * @note The arg_min flag must be passed at compile time using -DARG_MIN to compute the ArgMin + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] prev_res_ptr (Optional) Pointer to previous results tensor.
Supported data types: U32/S32 + * @param[in] prev_res_stride_x (Optional) Stride of the previous results tensor in X dimension (in bytes) + * @param[in] prev_res_step_x (Optional) prev_res_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] prev_res_stride_y (Optional) Stride of the previous results tensor in Y dimension (in bytes) + * @param[in] prev_res_step_y (Optional) prev_res_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] prev_res_offset_first_element_in_bytes (Optional) The offset of the first element in the previous results tensor + * @param[in] partial_res_ptr Pointer to the tensor holding the partial results. Supported data types: U32/S32 + * @param[in] partial_res_stride_x Stride of the partial results tensor in X dimension (in bytes) + * @param[in] partial_res_step_x partial_res_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] partial_res_stride_y Stride of the partial results tensor in Y dimension (in bytes) + * @param[in] partial_res_step_y partial_res_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] partial_res_offset_first_element_in_bytes The offset of the first element in the partial results tensor + * @param[in] local_results Local buffer for storing the partial result + */ +__kernel void arg_min_max_x( + IMAGE_DECLARATION(src), +#if defined(PREV_OUTPUT) + IMAGE_DECLARATION(prev_res), +#endif // defined(PREV_OUTPUT) + IMAGE_DECLARATION(partial_res), + __local DATA_TYPE_OUTPUT *local_results) +{ +#if defined(PREV_OUTPUT) + Image src = CONVERT_TO_IMAGE_STRUCT_NO_STEP(src); + Image prev_res = CONVERT_TO_IMAGE_STRUCT(prev_res); +#else // !defined(PREV_OUTPUT) + Image src = CONVERT_TO_IMAGE_STRUCT(src); +#endif // defined(PREV_OUTPUT) + Image partial_res = CONVERT_TO_IMAGE_STRUCT(partial_res); + + unsigned int lsize = get_local_size(0); + unsigned int lid = get_local_id(0); + + const uint x_idx = get_global_id(0); + const uint y_idx = get_global_id(1); + const __global DATA_TYPE *src_in_row = (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + y_idx * src_step_y); + + for(unsigned int y = 0; y < get_local_size(1); ++y) + { +#if defined(ARG_MAX) +#if defined(PREV_OUTPUT) + local_results[lid] = arg_idx_max_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx); +#else // !defined(PREV_OUTPUT) + local_results[lid] = arg_idx_max((__global DATA_TYPE *)offset(&src, 0, y), x_idx); +#endif // defined(PREV_OUTPUT) +#else // defined(ARG_MIN) +#if defined(PREV_OUTPUT) + local_results[lid] = arg_idx_min_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)offset(&prev_res, 0, y), x_idx); +#else // !defined(PREV_OUTPUT) + local_results[lid] = arg_idx_min((__global DATA_TYPE *)offset(&src, 0, y), x_idx); +#endif // defined(PREV_OUTPUT) +#endif // defined(ARG_MAX) || defined(ARG_MIN) + + barrier(CLK_LOCAL_MEM_FENCE); + + // Looking for the next highest power of 2 (maximum value of lsize is 8) + unsigned int middle = lsize - 1; + middle |= middle >> 1; + middle |= middle >> 2; + middle += 1; + // Perform parallel reduction + for(unsigned int i = middle; i > 0; i >>= 1) + { + if(lid < i && lid + i < lsize) + { + DATA_TYPE tmp0 = *(src_in_row + local_results[lid]); + DATA_TYPE tmp1 = *(src_in_row + local_results[lid + i]); +#if defined(ARG_MAX) + local_results[lid] = select( + local_results[lid], + local_results[lid + i], + ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 < tmp1)); +#else // defined(ARG_MIN) +
local_results[lid] = select( + local_results[lid], + local_results[lid + i], + ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 > tmp1)); +#endif // defined(ARG_MAX) || defined(ARG_MIN) + } + barrier(CLK_LOCAL_MEM_FENCE); + } + + if(lid == 0) + { + ((__global DATA_TYPE_OUTPUT *)offset(&partial_res, get_group_id(0), y))[0] = local_results[0]; + } + } +} +#endif // defined(WIDTH) + +#if defined(HEIGHT) +/** This kernel performs a reduction along the y-axis. + * + * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE + * @note The data type of the output must be passed at compile time using -DDATA_TYPE_OUTPUT: e.g. -DDATA_TYPE_OUTPUT=uint + * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor, which holds the computed indices. Supported data types: U32/S32 + * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void arg_min_max_y( + IMAGE_DECLARATION(input), + IMAGE_DECLARATION(output)) +{ + const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); + + __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y; + __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE_OUTPUT) + get_global_id(1) * output_stride_y; + + VEC_TYPE_IN res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_TYPE_IN); + + VEC_TYPE_OUT indx0 = 0; + for(DATA_TYPE_OUTPUT y = 1; y < HEIGHT; ++y) + { + VEC_TYPE_IN in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + y * input_stride_y)), VEC_TYPE_IN); + + VEC_TYPE_OUT cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_TYPE_OUT); + indx0 = select(indx0, (VEC_TYPE_OUT)y, cond_conv); + res = select(res, in, CONDITION_TO_USE(in, res)); + } + + // Store result + STORE_VECTOR_SELECT(indx, DATA_TYPE_OUTPUT, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); +} +#endif // defined(HEIGHT) + +#if defined(DEPTH) && !defined(BATCH) +/** This kernel performs a reduction along the z-axis. + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g.
-DDATA_TYPE=float + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE + * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor, which holds the computed indices. Supported data types: U32/S32 + * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void arg_min_max_z( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); + + __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z; + __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE_OUTPUT) + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z; + + VEC_TYPE_IN res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_TYPE_IN); + + VEC_TYPE_OUT indx0 = 0; + for(DATA_TYPE_OUTPUT z = 1; z < DEPTH; ++z) + { + VEC_TYPE_IN in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + z * input_stride_z)), VEC_TYPE_IN); + + VEC_TYPE_OUT cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_TYPE_OUT); + indx0 = select(indx0, (VEC_TYPE_OUT)z, cond_conv); + res = select(res, in, CONDITION_TO_USE(in, res)); + } + + // Store result + STORE_VECTOR_SELECT(indx, DATA_TYPE_OUTPUT, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); +} +#endif /* defined(DEPTH) && !defined(BATCH) */ + +#if defined(BATCH) && defined(DEPTH) +/** This kernel performs a reduction along the w-axis. + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3.
It is defined as the remainder between the input's first dimension and VEC_SIZE + * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128 + * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor, which holds the computed indices. Supported data types: U32/S32 + * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the output tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void arg_min_max_w( + TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output)) +{ + const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); + + __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y + (get_global_id(2) % DEPTH) * input_stride_z + + (get_global_id(2) / DEPTH) * input_stride_w; + __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE_OUTPUT) + get_global_id(1) * output_stride_y + (get_global_id( + 2) % DEPTH) * output_stride_z + (get_global_id(2) / DEPTH) * output_stride_w; + + VEC_TYPE_IN res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_TYPE_IN); + + VEC_TYPE_OUT indx0 = 0; + for(DATA_TYPE_OUTPUT w = 1; w < BATCH; ++w) + { + VEC_TYPE_IN in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + w * input_stride_w)), VEC_TYPE_IN); + + VEC_TYPE_OUT cond_conv = CONVERT(CONDITION_TO_USE(in, res), VEC_TYPE_OUT); + indx0 = select(indx0, (VEC_TYPE_OUT)w, cond_conv); + res = select(res, in, CONDITION_TO_USE(in, res)); + } + + // Store result + STORE_VECTOR_SELECT(indx, DATA_TYPE_OUTPUT, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); +} +#endif /* defined(BATCH) && defined(DEPTH) */
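All four reduction kernels above are specialised purely through build options. As a rough host-side sketch (the program and device handles, the float/uint types and the WIDTH value are assumptions for illustration, not part of this patch), a first-stage x-axis ArgMax might be built as follows; the y/z/w variants would instead pass -DHEIGHT, -DDEPTH or -DBATCH together with -DVEC_SIZE_LEFTOVER:

#include <CL/cl.h>

/* Hypothetical build options for a first-stage arg_min_max_x: float input,
 * uint indices, ArgMax over rows of 128 elements. Later stages of the same
 * reduction would additionally pass -DPREV_OUTPUT. */
static cl_int build_arg_min_max_x(cl_program program, cl_device_id device)
{
    const char *build_opts = "-DDATA_TYPE=float -DDATA_TYPE_OUTPUT=uint "
                             "-DVEC_SIZE=16 -DFLOAT_DATA_TYPE -DARG_MAX -DWIDTH=128";
    return clBuildProgram(program, 1, &device, build_opts, NULL, NULL);
}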
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE_OUTPUT) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/common/batchnormalization_layer.cl b/src/core/CL/cl_kernels/common/batchnormalization_layer.cl new file mode 100644 index 0000000000..18f54907df --- /dev/null +++ b/src/core/CL/cl_kernels/common/batchnormalization_layer.cl @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(EPSILON) +/** OpenCL kernel to fuse the weights of a convolution or depthwise convolution layer with batch normalization when the data layout is either NCHW or NHWC + * + * @note The input weights tensor is assumed 4D with the OFMs in the fourth dimension + * @note Data type should be passed at compile time using the -DDATA_TYPE, e.g. -DDATA_TYPE=float + * @note The third dimension of the input tensor should be passed at compile time when weights belong to a convolution layer using -DDIM2=size. e.g. -DDIM2=16. + * For depthwise convolution weights, do not pass DIM2 + * @note Data layout NHWC should be passed at compile time with -DNHWC. For data layout NCHW it is not required to pass any parameter + * @note Batch normalization epsilon parameter should be passed at compile time using -DEPSILON=value. e.g. -DEPSILON=0.001f + * + * @param[in] w_ptr Pointer to the weights tensor. Supported data types: F16/F32 + * @param[in] w_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] w_step_x w_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] w_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] w_step_y w_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] w_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] w_step_z w_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] w_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] b_ptr (Optional) Pointer to the bias tensor.
Supported data types: same as @p w_ptr + * @param[in] b_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes) + * @param[in] b_step_x (Optional) b_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] b_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes) + * @param[in] b_step_y (Optional) b_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] b_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes) + * @param[in] b_step_z (Optional) b_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] b_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor + * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p w_ptr + * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) + * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor + * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p w_ptr + * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes) + * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor + * @param[out] w_fused_ptr (Optional) Pointer to the destination weights tensors. Supported data types: same as @p w_ptr + * @param[in] w_fused_stride_x (Optional) Stride of the destination weights tensor in X dimension (in bytes) + * @param[in] w_fused_step_x (Optional) w_fused_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] w_fused_stride_y (Optional) Stride of the destination weights tensor in Y dimension (in bytes) + * @param[in] w_fused_step_y (Optional) w_fused_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] w_fused_stride_z (Optional) Stride of the destination weights tensor in Z dimension (in bytes) + * @param[in] w_fused_step_z (Optional) w_fused_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] w_fused_offset_first_element_in_bytes (Optional) The offset of the first element in the destination weights tensor + * @param[in] b_fused_ptr (Optional) Pointer to the destination bias tensor. Supported data types: same as @p w_ptr + * @param[in] b_fused_stride_x (Optional) Stride of the destination bias tensor in X dimension (in bytes) + * @param[in] b_fused_step_x (Optional) b_fused_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] b_fused_offset_first_element_in_bytes (Optional) The offset of the first element in the destination bias tensor + * @param[in] beta_ptr (Optional) Pointer to the beta source tensor. Supported data types: same as @p w_ptr + * @param[in] beta_stride_x (Optional) Stride of the beta source tensor in X dimension (in bytes) + * @param[in] beta_step_x (Optional) beta_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] beta_offset_first_element_in_bytes (Optional) The offset of the first element in the beta source tensor + * @param[in] gamma_ptr (Optional) Pointer to the gamma source tensor. 
Supported data types: same as @p w_ptr + * @param[in] gamma_stride_x (Optional) Stride of the gamma source tensor in X dimension (in bytes) + * @param[in] gamma_step_x (Optional) gamma_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] gamma_offset_first_element_in_bytes (Optional) The offset of the first element in the gamma source tensor + */ +__kernel void fuse_batchnormalization_layer(TENSOR3D_DECLARATION(w), +#if defined(BIAS) + VECTOR_DECLARATION(b), +#endif // defined(BIAS) + VECTOR_DECLARATION(mean), + VECTOR_DECLARATION(var) +#ifndef IN_PLACE_W + , + TENSOR3D_DECLARATION(w_fused) +#endif // ifndef IN_PLACE_W +#ifndef IN_PLACE_B + , + VECTOR_DECLARATION(b_fused) +#endif // ifndef IN_PLACE_B +#if defined(BETA) + , + VECTOR_DECLARATION(beta) +#endif // defined(BETA) +#if defined(GAMMA) + , + VECTOR_DECLARATION(gamma) +#endif // defined(GAMMA) + ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + +#if defined(DIM2) + int c0 = z % DIM2; + int c1 = z / DIM2; +#else // ! defined(DIM2) + int c0 = 0; +#if defined(NHWC) + int c1 = x; +#else // defined(NHWC) + int c1 = z; +#endif // defined(NHWC) +#endif // defined(DIM2) + + int w_offset = x * sizeof(DATA_TYPE) + y * w_stride_y + z * w_stride_z; + int v_offset = c1 * sizeof(DATA_TYPE); + + DATA_TYPE w_old = 0.0f; + DATA_TYPE b_old = 0.0f; + DATA_TYPE w_new = 0.0f; + DATA_TYPE b_new = 0.0f; + DATA_TYPE gamma = 1.0f; + DATA_TYPE mean = 0.0f; + DATA_TYPE var = 1.0f; + DATA_TYPE beta = 0.0f; + + w_old = *((__global DATA_TYPE *)(w_ptr + w_offset + w_offset_first_element_in_bytes)); + var = *((__global DATA_TYPE *)(var_ptr + v_offset + var_offset_first_element_in_bytes)); + mean = *((__global DATA_TYPE *)(mean_ptr + v_offset + mean_offset_first_element_in_bytes)); + +#if defined(GAMMA) + gamma = *((__global DATA_TYPE *)(gamma_ptr + v_offset + gamma_offset_first_element_in_bytes)); +#endif // defined(GAMMA) + + // Compute new weight + w_new = (gamma * w_old) / (sqrt(var + EPSILON)); + +#if defined(IN_PLACE_W) + *((__global DATA_TYPE *)(w_ptr + w_offset + w_offset_first_element_in_bytes)) = w_new; +#else // defined(IN_PLACE_W) + *((__global DATA_TYPE *)(w_fused_ptr + w_offset + w_fused_offset_first_element_in_bytes)) = w_new; +#endif // defined(IN_PLACE_W) + + // Compute bias +#if !defined(DIM2) && defined(NHWC) + if(z == 0 && y == 0) +#else // !defined(DIM2) && defined(NHWC) + if(x == 0 && y == 0 && c0 == 0) +#endif // !defined(DIM2) && defined(NHWC) + { +#if defined(BIAS) + b_old = *((__global DATA_TYPE *)(b_ptr + v_offset + b_offset_first_element_in_bytes)); +#endif // defined(BIAS) +#if defined(BETA) + beta = *((__global DATA_TYPE *)(beta_ptr + v_offset + beta_offset_first_element_in_bytes)); +#endif // defined(BETA) + + b_new = ((gamma * (b_old - mean)) / (sqrt(var + EPSILON))) + beta; + +#if defined(BIAS) + +#if defined(IN_PLACE_B) + *((__global DATA_TYPE *)(b_ptr + v_offset + b_offset_first_element_in_bytes)) = b_new; +#else // defined(IN_PLACE_B) + *((__global DATA_TYPE *)(b_fused_ptr + v_offset + b_fused_offset_first_element_in_bytes)) = b_new; +#endif // defined(IN_PLACE_B) + +#else // defined(BIAS) + +#ifndef IN_PLACE_B + *((__global DATA_TYPE *)(b_fused_ptr + v_offset + b_fused_offset_first_element_in_bytes)) = b_new; +#endif // ifndef IN_PLACE_B + +#endif // defined(BIAS) + } +} +#endif // defined(DATA_TYPE) && defined(EPSILON) \ No newline at end of file
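Written out, the fusion the kernel above performs is the standard folding of batch-normalization statistics into the convolution parameters (this restates the code, with \(\gamma\), \(\beta\), \(\mu\), \(\sigma^2\) and \(\epsilon\) corresponding to gamma, beta, mean, var and EPSILON; \(\gamma\) defaults to 1 and \(\beta\) to 0 when GAMMA/BETA are not defined):

\[
w_{\text{fused}} = \frac{\gamma\, w}{\sqrt{\sigma^2 + \epsilon}}, \qquad
b_{\text{fused}} = \frac{\gamma\,(b - \mu)}{\sqrt{\sigma^2 + \epsilon}} + \beta
\]

Applying the fused weights and bias to an input then reproduces convolution followed by batch normalization in a single pass.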
diff --git a/src/core/CL/cl_kernels/common/bitwise_op.cl b/src/core/CL/cl_kernels/common/bitwise_op.cl new file mode 100644 index 0000000000..e142c1d275 --- /dev/null +++ b/src/core/CL/cl_kernels/common/bitwise_op.cl @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) + +/** This function computes the bitwise OR of two input images. + * + * @note The following variables must be passed at compile time: + * -# -DVEC_SIZE : The number of elements processed in X dimension + * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE + * + * @param[in] in1_ptr Pointer to the source image. Supported data types: U8 + * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] in2_ptr Pointer to the source image. Supported data types: U8 + * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image.
Supported data types: U8 + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void bitwise_or( + IMAGE_DECLARATION(in1), + IMAGE_DECLARATION(in2), + IMAGE_DECLARATION(out)) +{ + uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); + + // Get pixels pointer + __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x_offs + get_global_id(1) * in1_step_y; + __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x_offs + get_global_id(1) * in2_step_y; + __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x_offs + get_global_id(1) * out_step_y; + + // Load data + VEC_DATA_TYPE(uchar, VEC_SIZE) + in_a = VLOAD(VEC_SIZE)(0, (__global uchar *)in1_addr); + VEC_DATA_TYPE(uchar, VEC_SIZE) + in_b = VLOAD(VEC_SIZE)(0, (__global uchar *)in2_addr); + + VEC_DATA_TYPE(uchar, VEC_SIZE) + data0 = in_a | in_b; + + // Boundary-aware store + STORE_VECTOR_SELECT(data, uchar, (__global uchar *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); +} + +/** This function computes the bitwise AND of two input images. + * + * @note The following variables must be passed at compile time: + * -# -DVEC_SIZE : The number of elements processed in X dimension + * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE + * + * @param[in] in1_ptr Pointer to the source image. Supported data types: U8 + * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] in2_ptr Pointer to the source image. Supported data types: U8 + * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. 
Supported data types: U8 + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void bitwise_and( + IMAGE_DECLARATION(in1), + IMAGE_DECLARATION(in2), + IMAGE_DECLARATION(out)) +{ + uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); + + // Get pixels pointer + __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x_offs + get_global_id(1) * in1_step_y; + __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x_offs + get_global_id(1) * in2_step_y; + __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x_offs + get_global_id(1) * out_step_y; + + // Load data + VEC_DATA_TYPE(uchar, VEC_SIZE) + in_a = VLOAD(VEC_SIZE)(0, (__global uchar *)in1_addr); + VEC_DATA_TYPE(uchar, VEC_SIZE) + in_b = VLOAD(VEC_SIZE)(0, (__global uchar *)in2_addr); + + VEC_DATA_TYPE(uchar, VEC_SIZE) + data0 = in_a & in_b; + + // Boundary-aware store + STORE_VECTOR_SELECT(data, uchar, (__global uchar *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); +} + +/** This function computes the bitwise XOR of two input images. + * + * @note The following variables must be passed at compile time: + * -# -DVEC_SIZE : The number of elements processed in X dimension + * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE + * + * @param[in] in1_ptr Pointer to the source image. Supported data types: U8 + * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] in2_ptr Pointer to the source image. Supported data types: U8 + * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. 
Supported data types: U8 + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void bitwise_xor( + IMAGE_DECLARATION(in1), + IMAGE_DECLARATION(in2), + IMAGE_DECLARATION(out)) +{ + uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); + + // Get pixels pointer + __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x_offs + get_global_id(1) * in1_step_y; + __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x_offs + get_global_id(1) * in2_step_y; + __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x_offs + get_global_id(1) * out_step_y; + + // Load data + VEC_DATA_TYPE(uchar, VEC_SIZE) + in_a = VLOAD(VEC_SIZE)(0, (__global uchar *)in1_addr); + VEC_DATA_TYPE(uchar, VEC_SIZE) + in_b = VLOAD(VEC_SIZE)(0, (__global uchar *)in2_addr); + + VEC_DATA_TYPE(uchar, VEC_SIZE) + data0 = in_a ^ in_b; + + // Boundary-aware store + STORE_VECTOR_SELECT(data, uchar, (__global uchar *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); +} + +/** This function computes the bitwise NOT of an image. + * + * @note The following variables must be passed at compile time: + * -# -DVEC_SIZE : The number of elements processed in X dimension + * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE + * + * @param[in] in1_ptr Pointer to the source image. Supported data types: U8 + * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image.
Supported data types: U8 + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void bitwise_not( + IMAGE_DECLARATION(in1), + IMAGE_DECLARATION(out)) +{ + uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); + + // Get pixels pointer + __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x_offs + get_global_id(1) * in1_step_y; + __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + x_offs + get_global_id(1) * out_step_y; + + // Load data + VEC_DATA_TYPE(uchar, VEC_SIZE) + in_a = VLOAD(VEC_SIZE)(0, (__global uchar *)in1_addr); + + VEC_DATA_TYPE(uchar, VEC_SIZE) + data0 = ~in_a; + + // Boundary-aware store + STORE_VECTOR_SELECT(data, uchar, (__global uchar *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); +} + +#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/common/bounding_box_transform.cl b/src/core/CL/cl_kernels/common/bounding_box_transform.cl new file mode 100644 index 0000000000..f2e9cb0ed0 --- /dev/null +++ b/src/core/CL/cl_kernels/common/bounding_box_transform.cl @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(WEIGHT_X) && defined(WEIGHT_Y) && defined(WEIGHT_W) && defined(WEIGHT_H) && defined(IMG_WIDTH) && defined(IMG_HEIGHT) && defined(BOX_FIELDS) && defined(SCALE_BEFORE) // Check for compile time constants + +/** Transform proposal bounding boxes to target bounding box using bounding box deltas. + * + * @attention The following variables must be passed at compile time: + * -# -DDATA_TYPE= Tensor data type. 
Supported data types: F16/F32 + * -# -DWEIGHT{X,Y,W,H}= Weights [wx, wy, ww, wh] for the deltas + * -# -DIMG_WIDTH= Original image width + * -# -DIMG_HEIGHT= Original image height + * -# -DBOX_FIELDS= Number of fields that are used to represent a box in boxes + * + * @param[in] boxes_ptr Pointer to the boxes tensor. Supported data types: F16/F32 + * @param[in] boxes_stride_x Stride of the boxes tensor in X dimension (in bytes) + * @param[in] boxes_step_x boxes_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] boxes_stride_y Stride of the boxes tensor in Y dimension (in bytes) + * @param[in] boxes_step_y boxes_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] boxes_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] boxes_step_z boxes_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] boxes_offset_first_element_in_bytes The offset of the first element in the boxes tensor + * @param[out] pred_boxes_ptr Pointer to the predicted boxes. Supported data types: same as @p in_ptr + * @param[in] pred_boxes_stride_x Stride of the predicted boxes in X dimension (in bytes) + * @param[in] pred_boxes_step_x pred_boxes_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] pred_boxes_stride_y Stride of the predicted boxes in Y dimension (in bytes) + * @param[in] pred_boxes_step_y pred_boxes_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] pred_boxes_stride_z Stride of the predicted boxes in Z dimension (in bytes) + * @param[in] pred_boxes_step_z pred_boxes_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] pred_boxes_offset_first_element_in_bytes The offset of the first element in the predicted boxes + * @param[in] deltas_ptr Pointer to the deltas tensor. 
Supported data types: same as @p in_ptr + * @param[in] deltas_stride_x Stride of the deltas tensor in X dimension (in bytes) + * @param[in] deltas_step_x deltas_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] deltas_stride_y Stride of the deltas tensor in Y dimension (in bytes) + * @param[in] deltas_step_y deltas_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] deltas_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] deltas_step_z deltas_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] deltas_offset_first_element_in_bytes The offset of the first element in the deltas tensor + */ +__kernel void bounding_box_transform( + VECTOR_DECLARATION(boxes), + IMAGE_DECLARATION(pred_boxes), + IMAGE_DECLARATION(deltas)) +{ + // Get pixels pointer + Vector boxes = CONVERT_TO_VECTOR_STRUCT_NO_STEP(boxes); + Image pred_boxes = CONVERT_TO_IMAGE_STRUCT(pred_boxes); + Image deltas = CONVERT_TO_IMAGE_STRUCT(deltas); + + // Load delta and box values into registers + const DATA_TYPE one = (DATA_TYPE)1.f; + const DATA_TYPE halfone = (DATA_TYPE)0.5f; + + const int py = get_global_id(1); // box + const VEC_DATA_TYPE(DATA_TYPE, 4) + scale_before = (VEC_DATA_TYPE(DATA_TYPE, 4))SCALE_BEFORE; + VEC_DATA_TYPE(DATA_TYPE, 4) + delta = vload4(0, (__global DATA_TYPE *)deltas.ptr); + const VEC_DATA_TYPE(DATA_TYPE, 4) + box = vload4(0, (__global DATA_TYPE *)vector_offset(&boxes, BOX_FIELDS * py)) / scale_before; + + // Calculate width and centers of the old boxes + const VEC_DATA_TYPE(DATA_TYPE, 2) + dims = box.s23 - box.s01 + one; + const VEC_DATA_TYPE(DATA_TYPE, 2) + ctr = box.s01 + halfone * dims; + const VEC_DATA_TYPE(DATA_TYPE, 4) + weights = (VEC_DATA_TYPE(DATA_TYPE, 4))(WEIGHT_X, WEIGHT_Y, WEIGHT_W, WEIGHT_H); + delta /= weights; + delta.s23 = min(delta.s23, (DATA_TYPE)BBOX_XFORM_CLIP); + + // Calculate widths and centers of the new boxes (translation + aspect ratio transformation) + const VEC_DATA_TYPE(DATA_TYPE, 2) + pred_ctr = delta.s01 * dims + ctr; + const VEC_DATA_TYPE(DATA_TYPE, 2) + pred_dims = exp(delta.s23) * dims; + + // Useful vector constant definitions + const VEC_DATA_TYPE(DATA_TYPE, 4) + max_values = (VEC_DATA_TYPE(DATA_TYPE, 4))(IMG_WIDTH - 1, IMG_HEIGHT - 1, IMG_WIDTH - 1, IMG_HEIGHT - 1); + const VEC_DATA_TYPE(DATA_TYPE, 4) + sign = (VEC_DATA_TYPE(DATA_TYPE, 4))(-1, -1, 1, 1); + const VEC_DATA_TYPE(DATA_TYPE, 4) + min_values = 0; + + // Calculate the coordinates of the new boxes + VEC_DATA_TYPE(DATA_TYPE, 4) + pred_box = pred_ctr.s0101 + sign * halfone * pred_dims.s0101; +#ifdef OFFSET // Possibly adjust the predicted boxes + pred_box.s23 -= one; +#endif // Possibly adjust the predicted boxes + pred_box = CLAMP(pred_box, min_values, max_values); +#ifdef SCALE_AFTER // Possibly scale the predicted boxes + pred_box *= (VEC_DATA_TYPE(DATA_TYPE, 4))SCALE_AFTER; +#endif // Possibly scale the predicted boxes + + // Store them into the output + vstore4(pred_box, 0, (__global DATA_TYPE *)pred_boxes.ptr); +} + +#endif // defined(DATA_TYPE) && defined(WEIGHT_X) && defined(WEIGHT_Y) && defined(WEIGHT_W) && defined(WEIGHT_H) && defined(IMG_WIDTH) && defined(IMG_HEIGHT) && defined(BOX_FIELDS) && defined(SCALE_BEFORE)
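For reference, the kernel above is the usual R-CNN style box decoding. Restating the code for the x coordinates (y is analogous), with \((x_1, x_2)\) the input box edges and \((\delta_x, \delta_w)\) the deltas after division by the WEIGHT_* values and clipping of \(\delta_w\) at BBOX_XFORM_CLIP:

\[
w = x_2 - x_1 + 1, \qquad c_x = x_1 + \tfrac{w}{2}
\]
\[
\hat{c}_x = \delta_x\, w + c_x, \qquad \hat{w} = e^{\delta_w}\, w
\]
\[
\hat{x}_1 = \hat{c}_x - \tfrac{\hat{w}}{2}, \qquad \hat{x}_2 = \hat{c}_x + \tfrac{\hat{w}}{2}
\]

The result is then clamped to \([0, \text{IMG\_WIDTH} - 1]\), optionally shifted by one under OFFSET and rescaled under SCALE_AFTER, exactly as the preprocessor branches above show.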
diff --git a/src/core/CL/cl_kernels/common/bounding_box_transform_quantized.cl b/src/core/CL/cl_kernels/common/bounding_box_transform_quantized.cl new file mode 100644 index 0000000000..c1d45a56b9 --- /dev/null +++ b/src/core/CL/cl_kernels/common/bounding_box_transform_quantized.cl @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers_asymm.h" + +#if defined(DATA_TYPE) && defined(DATA_TYPE_DELTAS) && defined(WEIGHT_X) && defined(WEIGHT_Y) && defined(WEIGHT_W) && defined(WEIGHT_H) && defined(IMG_WIDTH) && defined(IMG_HEIGHT) && defined(BOX_FIELDS) && defined(SCALE_BEFORE) && defined(OFFSET_BOXES) && defined(SCALE_BOXES) && defined(OFFSET_DELTAS) && defined(SCALE_DELTAS) && defined(OFFSET_PRED_BOXES) && defined(SCALE_PRED_BOXES) // Check for compile time constants + +/** Transform proposal bounding boxes to target bounding box using bounding box deltas for quantized data types. + * + * @attention The following variables must be passed at compile time: + * -# -DDATA_TYPE= Tensor data type. Supported data types: QASYMM16 for boxes and pred_boxes, QASYMM8 for deltas + * -# -DWEIGHT{X,Y,W,H}= Weights [wx, wy, ww, wh] for the deltas + * -# -DIMG_WIDTH= Original image width + * -# -DIMG_HEIGHT= Original image height + * -# -DBOX_FIELDS= Number of fields that are used to represent a box in boxes + * + * @param[in] boxes_ptr Pointer to the boxes tensor. Supported data types: QASYMM16 + * @param[in] boxes_stride_x Stride of the boxes tensor in X dimension (in bytes) + * @param[in] boxes_step_x boxes_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] boxes_stride_y Stride of the boxes tensor in Y dimension (in bytes) + * @param[in] boxes_step_y boxes_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] boxes_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] boxes_step_z boxes_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] boxes_offset_first_element_in_bytes The offset of the first element in the boxes tensor + * @param[out] pred_boxes_ptr Pointer to the predicted boxes.
Supported data types: same as @p in_ptr + * @param[in] pred_boxes_stride_x Stride of the predicted boxes in X dimension (in bytes) + * @param[in] pred_boxes_step_x pred_boxes_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] pred_boxes_stride_y Stride of the predicted boxes in Y dimension (in bytes) + * @param[in] pred_boxes_step_y pred_boxes_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] pred_boxes_stride_z Stride of the predicted boxes in Z dimension (in bytes) + * @param[in] pred_boxes_step_z pred_boxes_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] pred_boxes_offset_first_element_in_bytes The offset of the first element in the predicted boxes + * @param[in] deltas_ptr Pointer to the deltas tensor. Supported data types: QASYMM8 + * @param[in] deltas_stride_x Stride of the deltas tensor in X dimension (in bytes) + * @param[in] deltas_step_x deltas_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] deltas_stride_y Stride of the deltas tensor in Y dimension (in bytes) + * @param[in] deltas_step_y deltas_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] deltas_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] deltas_step_z deltas_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] deltas_offset_first_element_in_bytes The offset of the first element in the deltas tensor + */ +__kernel void bounding_box_transform_quantized( + VECTOR_DECLARATION(boxes), + IMAGE_DECLARATION(pred_boxes), + IMAGE_DECLARATION(deltas)) +{ + // Get pixels pointer + Vector boxes = CONVERT_TO_VECTOR_STRUCT_NO_STEP(boxes); + Image pred_boxes = CONVERT_TO_IMAGE_STRUCT(pred_boxes); + Image deltas = CONVERT_TO_IMAGE_STRUCT(deltas); + + // Load delta and box values into registers + const float one = 1.f; + const float halfone = 0.5f; + + const int py = get_global_id(1); // box + float4 scale_before = (float4)SCALE_BEFORE; + float4 delta = DEQUANTIZE(vload4(0, (__global DATA_TYPE_DELTAS *)deltas.ptr), OFFSET_DELTAS, SCALE_DELTAS, DATA_TYPE_DELTAS, 4); + float4 box = DEQUANTIZE(vload4(0, (__global DATA_TYPE *)vector_offset(&boxes, BOX_FIELDS * py)), OFFSET_BOXES, SCALE_BOXES, DATA_TYPE, 4) / scale_before; + + // Calculate width and centers of the old boxes + float2 dims = box.s23 - box.s01 + one; + float2 ctr = box.s01 + halfone * dims; + float4 weights = (float4)(WEIGHT_X, WEIGHT_Y, WEIGHT_W, WEIGHT_H); + delta /= weights; + delta.s23 = min(delta.s23, (float)BBOX_XFORM_CLIP); + + // Calculate widths and centers of the new boxes (translation + aspect ratio transformation) + float2 pred_ctr = delta.s01 * dims + ctr; + float2 pred_dims = exp(delta.s23) * dims; + + // Useful vector constant definitions + float4 max_values = (float4)(IMG_WIDTH - 1, IMG_HEIGHT - 1, IMG_WIDTH - 1, IMG_HEIGHT - 1); + float4 sign = (float4)(-1, -1, 1, 1); + float4 min_values = 0; + + // Calculate the coordinates of the new boxes + float4 pred_box = pred_ctr.s0101 + sign * halfone * pred_dims.s0101; +#ifdef OFFSET // Possibly adjust the predicted boxes + pred_box.s23 -= one; +#endif // Possibly adjust the predicted boxes + pred_box = CLAMP(pred_box, min_values, max_values); +#ifdef SCALE_AFTER // Possibly scale the predicted boxes + pred_box *= (float4)SCALE_AFTER; +#endif // Possibly scale the predicted boxes + + // Store them into the output + vstore4(QUANTIZE(pred_box, OFFSET_PRED_BOXES, SCALE_PRED_BOXES, 
DATA_TYPE, 4), 0, (__global DATA_TYPE *)pred_boxes.ptr); +} +#endif // Check for compile time constants
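The cast kernels that follow, like the bitwise and arg_min_max kernels above, share one boundary-aware addressing pattern: the first work-item handles the leftover elements and every later work-item is shifted back so that its full-vector loads and stores stay in bounds. A worked sketch with assumed numbers (tensor width 19, VEC_SIZE 8, hence VEC_SIZE_LEFTOVER 3; illustrative only, not part of this patch):

/* x_offs as computed in these kernels. With width 19, VEC_SIZE 8, LEFTOVER 3:
 *   gid 0 -> 0  (STORE_VECTOR_SELECT writes only the first 3 elements)
 *   gid 1 -> 3  (full 8-element access, ending at 11)
 *   gid 2 -> 11 (full 8-element access, ending exactly at 19)
 */
static int x_offs(int gid, int vec_size, int leftover)
{
    const int shift = (vec_size - leftover) % vec_size; /* 0 when there is no leftover */
    const int offs  = gid * vec_size - shift;
    return offs > 0 ? offs : 0;
}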
Supported data types: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32 + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void cast_down( + TENSOR3D_DECLARATION(in), + TENSOR3D_DECLARATION(out)) +{ + int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); + + __global uchar *in_addr = in_ptr + in_offset_first_element_in_bytes + sizeof(DATA_TYPE_IN) * x_offs + get_global_id(1) * in_stride_y + get_global_id(2) * in_stride_z; + __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + sizeof(DATA_TYPE_OUT) * x_offs + get_global_id(1) * out_stride_y + get_global_id(2) * out_stride_z; + + // Load data + VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) + in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)in_addr); + +#if defined(IS_DATA_TYPE_QUANTIZED) + in_data ^= (VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE))0x80; +#endif // defined(IS_DATA_TYPE_QUANTIZED) + +#if defined(IS_DATA_TYPE_FLOAT) + VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) + res0 = CONVERT_DOWN(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); + STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) +#else /* defined(IS_DATA_TYPE_FLOAT) */ + VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) + res0 = CONVERT_DOWN(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); + STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) +#endif /* defined(IS_DATA_TYPE_FLOAT) */ +} + +/** This function performs a up-casting + * + * @note The input and output data_types need to be passed at compile time using -DDATA_TYPE_IN and -DDATA_TYPE_OUT: + * e.g. -DDATA_TYPE_IN=uchar -DDATA_TYPE_OUT=short + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE + * + * @param[in] in_ptr Pointer to the source image. Supported data types: U8/S8/U16/S16/U32/S32/F16/F32 + * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] in_step_z in_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. 
Supported data types: U8/U16/S16/U32/S32/F16/F32 + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void cast_up( + TENSOR3D_DECLARATION(in), + TENSOR3D_DECLARATION(out)) +{ + int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); + + __global uchar *in_addr = in_ptr + in_offset_first_element_in_bytes + sizeof(DATA_TYPE_IN) * x_offs + get_global_id(1) * in_stride_y + get_global_id(2) * in_stride_z; + __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + sizeof(DATA_TYPE_OUT) * x_offs + get_global_id(1) * out_stride_y + get_global_id(2) * out_stride_z; + + // Load data + VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) + in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)in_addr); + +#if defined(IS_DATA_TYPE_FLOAT) + VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) + res0 = CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); + STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) +#else /* defined(IS_DATA_TYPE_FLOAT) */ + VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE) + res0 = CONVERT_UP(in_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)); + STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) +#endif /* defined(IS_DATA_TYPE_FLOAT) */ +} diff --git a/src/core/CL/cl_kernels/common/col2im.cl b/src/core/CL/cl_kernels/common/col2im.cl new file mode 100644 index 0000000000..89054dcb31 --- /dev/null +++ b/src/core/CL/cl_kernels/common/col2im.cl @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT) && defined(NUM_GROUPS)
+
+#if ELEMENT_SIZE == 1
+#define COND_DATA_TYPE char
+#elif ELEMENT_SIZE == 2
+#define COND_DATA_TYPE short
+#elif ELEMENT_SIZE == 4
+#define COND_DATA_TYPE int
+#else // ELEMENT_SIZE
+#error "Element size not supported"
+#endif // ELEMENT_SIZE
+
+/** This kernel performs a reshaping of the output of the convolution layer
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of the input tensor must be passed at compile time using -DWIDTH_INPUT: e.g. -DWIDTH_INPUT=320
+ * @note The width of the output tensor must be passed at compile time using -DWIDTH_OUTPUT: e.g. -DWIDTH_OUTPUT=600
+ * @note The element size must be passed at compile time using -DELEMENT_SIZE: e.g. -DELEMENT_SIZE=4
+ * @note The number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor.
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void col2im( + TENSOR3D_DECLARATION(src), + TENSOR4D_DECLARATION(dst)) +{ + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(dst, 0); + + const uint xd = get_global_id(1) % WIDTH_OUTPUT; // x coordinate of the destination tensor + const uint yd = get_global_id(1) / WIDTH_OUTPUT; // y coordinate of the destination tensor + + VEC_DATA_TYPE(DATA_TYPE, 8) + data = vload8(0, (__global DATA_TYPE *)src.ptr); + + uint x = get_global_id(0) * 8; + uint8 x_clamped = x + (uint8)(0, 1, 2, 3, 4, 5, 6, 7); + + VEC_DATA_TYPE(COND_DATA_TYPE, 8) + cond0 = CONVERT((x_clamped < WIDTH_INPUT), VEC_DATA_TYPE(COND_DATA_TYPE, 8)); + + // Clamp x if out-of-bounds + x_clamped = select((uint8)x, x_clamped, convert_int8(cond0)); + + // If out-of-bound, overwrite with the first element + data = select((VEC_DATA_TYPE(DATA_TYPE, 8))data.s0, data, cond0); + +#if NUM_GROUPS > 1 + // Compute output offset (batches on 4th dimension) + int idx = yd * dst_stride_y + xd * dst_stride_x + (get_global_id(2) / NUM_GROUPS) * dst.stride_w; + + const uint group = get_global_id(2) % NUM_GROUPS; // group ID + x_clamped += group * WIDTH_INPUT; +#else /* defined(NUM_GROUPS > 1 ) */ + // Compute output offset (batches on 3rd dimension) + int idx = yd * dst.stride_y + xd * dst.stride_x + get_global_id(2) * dst.stride_w; +#endif /* NUM_GROUPS > 1 */ + + // Store value + *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s0 * dst.stride_z)) = data.s0; + *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s1 * dst.stride_z)) = data.s1; + *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s2 * dst.stride_z)) = data.s2; + *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s3 * dst.stride_z)) = data.s3; + *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s4 * dst.stride_z)) = data.s4; + *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s5 * dst.stride_z)) = data.s5; + *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s6 * dst.stride_z)) = data.s6; + *((__global DATA_TYPE *)(dst.ptr + idx + x_clamped.s7 * dst.stride_z)) = data.s7; +} +#endif // defined(DATA_TYPE) && defined(WIDTH_OUTPUT) && defined(ELEMENT_SIZE) && defined(WIDTH_INPUT) && defined(NUM_GROUPS) diff --git a/src/core/CL/cl_kernels/common/comparisons.cl b/src/core/CL/cl_kernels/common/comparisons.cl new file mode 100644 index 0000000000..f05cb87835 --- /dev/null +++ b/src/core/CL/cl_kernels/common/comparisons.cl @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#define EQUAL(x, y) ((x) == (y)) +#define NOTEQUAL(x, y) ((x) != (y)) +#define GREATER(x, y) ((x) > (y)) +#define GREATEREQUAL(x, y) ((x) >= (y)) +#define LESS(x, y) ((x) < (y)) +#define LESSEQUAL(x, y) ((x) <= (y)) + +#define DEFINE_KERNEL_STR(name) compare_##name +#define DEFINE_KERNEL(name) DEFINE_KERNEL_STR(name) + +#define DEFINE_KERNEL_QUANTIZED_STR(name) compare_##name##_quantized +#define DEFINE_KERNEL_QUANTIZED(name) DEFINE_KERNEL_QUANTIZED_STR(name) + +#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OP) && defined(OP_NAME) +/** This function compares two tensors. + * + * @attention The inputs' data type need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention The comparison operation should be given as a preprocessor argument using -DOP=operation. e.g. -DOP=LESS + * + * @param[in] in1_ptr Pointer to the source tensor. Supported data types: All non-quantized data types. + * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] in2_ptr Pointer to the source tensor. 
Supported data types: same as @p in1_ptr + * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8 + * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void DEFINE_KERNEL(OP_NAME)( + TENSOR3D_DECLARATION(in1), + TENSOR3D_DECLARATION(in2), + TENSOR3D_DECLARATION(out)) +{ + // Get pixels pointer + Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); + Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + + // Load values + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + in_a = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1.ptr); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + in_b = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2.ptr); + + // Calculate and store result + VSTORE(VEC_SIZE) + (CONVERT(OP(in_a, in_b), VEC_DATA_TYPE(uchar, VEC_SIZE)), 0, (__global uchar *)out.ptr); +} +#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OP) && defined(OP_NAME) */ + +#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) +/** This function compares two quantized tensors. + * + * @note The inputs' data type need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar + * @note The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, i.e. -DOFFSET_IN1=10 + * @note The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, i.e. -DOFFSET_IN2=10 + * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, i.e. -DSCALE_IN1=10 + * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, i.e. -DSCALE_IN2=10 + * + * @param[in] in1_ptr Pointer to the source tensor. Supported data types: All quantized data types. 
+ * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] in2_ptr Pointer to the source tensor. Supported data types: same as @p in1_ptr + * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8 + * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void DEFINE_KERNEL_QUANTIZED(OP_NAME)( + TENSOR3D_DECLARATION(in1), + TENSOR3D_DECLARATION(in2), + TENSOR3D_DECLARATION(out)) +{ + // Get pixels pointer + Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); + Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + + int16 in_a = CONVERT(vload16(0, (__global DATA_TYPE *)in1.ptr), int16); + int16 in_b = CONVERT(vload16(0, (__global DATA_TYPE *)in2.ptr), int16); + + in_a = in_a - (int16)((int)OFFSET_IN1); + in_b = in_b - (int16)((int)OFFSET_IN2); + + const float16 in1f32 = convert_float16(in_a) * (float16)((float)SCALE_IN1); + const float16 in2f32 = convert_float16(in_b) * (float16)((float)SCALE_IN2); + const int16 res = OP(in1f32, in2f32); + + // Store result + vstore16(convert_uchar16(res), 0, (__global uchar *)out.ptr); +} +#endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) */ \ No newline at end of file diff --git a/src/core/CL/cl_kernels/common/concatenate.cl b/src/core/CL/cl_kernels/common/concatenate.cl new file mode 100644 index 0000000000..394b20c739 --- /dev/null +++ b/src/core/CL/cl_kernels/common/concatenate.cl @@ -0,0 +1,415 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(VEC_SIZE) +#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) + +#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) +#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) +#define VEC_QUANT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) +#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) +#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) +inline VEC_QUANT requantize(VEC_QUANT input, float in_offset, float out_offset, float in_scale, float out_scale) +{ + const VEC_FLOAT in_f32 = (CONVERT(input, VEC_FLOAT) - (VEC_FLOAT)((float)in_offset)) * (VEC_FLOAT)((float)in_scale); + const VEC_FLOAT out_f32 = in_f32 / ((VEC_FLOAT)(float)out_scale) + ((VEC_FLOAT)((float)out_offset)); + const VEC_QUANT res_q8 = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT), VEC_QUANT); + return res_q8; +} +#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ + +#if defined(DATA_TYPE) +#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + +#if defined(DEPTH) && defined(ELEMENT_SIZE) +#if defined(INPUT1_WIDTH) + +#define SELECT_TYPE SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) +#define SEQ VEC_OFFS(int, VEC_SIZE) + +/** This kernel concatenates two input tensors into the output tensor along the first dimension + * + * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float + * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE + * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16 + * @note First input tensor width should be given as a preprocessor argument using -DINPUT1_WIDTH=width. e.g. -DINPUT1_WIDTH=8 + * + * @param[in] src1_ptr Pointer to the source tensor. Supported data types: All. 
+ * @param[in] src1_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src1_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src1_stride_w Stride of the first source tensor in Z dimension (in bytes) + * @param[in] src1_step_w src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] src2_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr + * @param[in] src2_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src2_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src2_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src2_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src2_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src2_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src2_stride_w Stride of the first source tensor in Z dimension (in bytes) + * @param[in] src2_step_w src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src1_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void concatenate_width_x2( + TENSOR4D_DECLARATION(src1), + TENSOR4D_DECLARATION(src2), + TENSOR4D_DECLARATION(dst)) +{ + // Calculate input indices + const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); + const int y = get_global_id(1); + const int z = get_global_id(2) % (int)DEPTH; + const int w = get_global_id(2) / (int)DEPTH; + const int x1 = min(x, (int)INPUT1_WIDTH - (int)VEC_SIZE); + const int x2 = max(x - (int)INPUT1_WIDTH, 0); + + // Calculate inputs and output addresses + const __global uchar *dst_addr = dst_ptr + (int)dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * (int)dst_stride_y + z * (int)dst_stride_z + w * (int)dst_stride_w; + const __global uchar *src1_addr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * sizeof(DATA_TYPE) + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w; + const __global uchar *src2_addr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * sizeof(DATA_TYPE) + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w; + + VEC_TYPE src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src1_addr); + VEC_TYPE src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src2_addr); + +#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) + src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT); + src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT); +#endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) */ + const VEC_INT x_coords = SEQ + (VEC_INT)(x); + + // Rotate src1/2_values, if values0 is a combination of src1_values and src2_values. 
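+ // Worked example of the rotate-and-select trick (illustrative values, not from
+ // this patch): with VEC_SIZE=4 and INPUT1_WIDTH=6, the output vector at x=4
+ // needs elements {4,5} of src1 and elements {0,1} of src2. The clamped loads
+ // above start at x1=2 and x2=0, so rotating both vectors by INPUT1_ROTATE_N=2
+ // lines element 4 of src1 up with lane 0 and element 0 of src2 up with lane 2,
+ // after which the per-lane select below picks the correct source for each lane.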
+ SELECT_TYPE cond = CONVERT(((VEC_INT)x < (VEC_INT)INPUT1_WIDTH) && ((VEC_INT)x > (VEC_INT)(INPUT1_WIDTH - VEC_SIZE)), SELECT_TYPE); + src1_values = select(src1_values, ROTATE(src1_values, VEC_SIZE, INPUT1_ROTATE_N), cond); + src2_values = select(src2_values, ROTATE(src2_values, VEC_SIZE, INPUT1_ROTATE_N), cond); + + cond = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH), SELECT_TYPE); + const VEC_TYPE values0 = select(src2_values, src1_values, cond); + + STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) +} + +#if defined(INPUT2_WIDTH) && defined(INPUT3_WIDTH) +/** This kernel concatenates four input tensors into the output tensor along the first dimension + * + * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float + * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE + * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16 + * @note First input tensor width should be given as a preprocessor argument using -DINPUT1_WIDTH=width. e.g. -DINPUT1_WIDTH=8 + * @note Second input tensor width should be given as a preprocessor argument using -DINPUT2_WIDTH=width. e.g. -DINPUT2_WIDTH=8 + * @note Third input tensor width should be given as a preprocessor argument using -DINPUT3_WIDTH=width. e.g. -DINPUT3_WIDTH=8 + * + * @param[in] src1_ptr Pointer to the source tensor. Supported data types: All + * @param[in] src1_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src1_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src1_stride_w Stride of the first source tensor in Z dimension (in bytes) + * @param[in] src1_step_w src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] src2_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr + * @param[in] src2_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src2_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src2_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src2_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src2_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src2_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src2_stride_w Stride of the first source tensor in Z dimension (in bytes) + * @param[in] src2_step_w src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] src3_ptr Pointer to the source tensor. 
Supported data types: same as @p src1_ptr + * @param[in] src3_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src3_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src3_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src3_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src3_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src3_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src3_stride_w Stride of the first source tensor in Z dimension (in bytes) + * @param[in] src3_step_w src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src3_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] src4_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr + * @param[in] src4_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src4_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src4_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src4_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src4_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src4_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src4_stride_w Stride of the first source tensor in Z dimension (in bytes) + * @param[in] src4_step_w src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src4_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src1_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void concatenate_width_x4( + TENSOR4D_DECLARATION(src1), + TENSOR4D_DECLARATION(src2), + TENSOR4D_DECLARATION(src3), + TENSOR4D_DECLARATION(src4), + TENSOR4D_DECLARATION(dst)) +{ + // Calculate input indices + const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); + const int y = get_global_id(1); + const int z = get_global_id(2) % (int)DEPTH; + const int w = get_global_id(2) / (int)DEPTH; + + const int x1 = min(x, (int)INPUT1_WIDTH - (int)VEC_SIZE); + const int x2 = min(max(x - (int)INPUT1_WIDTH, 0), (int)INPUT2_WIDTH - (int)VEC_SIZE); + const int x3 = min(max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH, 0), (int)INPUT3_WIDTH - (int)VEC_SIZE); + const int x4 = max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH - (int)INPUT3_WIDTH, 0); + + // Calculate inputs and output addresses + const __global uchar *dst_addr = dst_ptr + (int)dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * (int)dst_stride_y + z * (int)dst_stride_z + w * (int)dst_stride_w; + const __global uchar *src1_addr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * sizeof(DATA_TYPE) + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w; + const __global uchar *src2_addr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * sizeof(DATA_TYPE) + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w; + const __global uchar *src3_addr = src3_ptr + (int)src3_offset_first_element_in_bytes + x3 * sizeof(DATA_TYPE) + y * (int)src3_stride_y + z * (int)src3_stride_z + w * (int)src3_stride_w; + const __global uchar *src4_addr = src4_ptr + (int)src4_offset_first_element_in_bytes + x4 * sizeof(DATA_TYPE) + y * (int)src4_stride_y + z * (int)src4_stride_z + w * (int)src4_stride_w; + + VEC_TYPE src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src1_addr); + VEC_TYPE src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src2_addr); + VEC_TYPE src3_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src3_addr); + VEC_TYPE src4_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src4_addr); + +#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4) + src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT); + src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT); + src3_values = requantize(src3_values, OFFSET_IN3, OFFSET_OUT, SCALE_IN3, SCALE_OUT); + src4_values = requantize(src4_values, OFFSET_IN4, OFFSET_OUT, 
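+ // requantize() (defined at the top of this file) converts values between two
+ // quantized domains: real = ((float)q - in_offset) * in_scale, then
+ // q_out = saturate(round(real / out_scale + out_offset)).
+ // Illustrative numbers: q = 130 with (offset 128, scale 0.5f), re-expressed in
+ // an output domain of (offset 0, scale 0.25f), gives (130 - 128) * 0.5f / 0.25f = 4.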
SCALE_IN4, SCALE_OUT); +#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4) */ + + const VEC_INT x_coords = SEQ + (VEC_INT)(x); + + SELECT_TYPE cond_in2 = CONVERT(((VEC_INT)x < (VEC_INT)INPUT1_WIDTH && (VEC_INT)x > (VEC_INT)(INPUT1_WIDTH - VEC_SIZE)), SELECT_TYPE); + SELECT_TYPE cond_in3 = CONVERT(((VEC_INT)x < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH) && (VEC_INT)x > (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH - VEC_SIZE)), SELECT_TYPE); + SELECT_TYPE cond_in4 = CONVERT(((VEC_INT)x < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH) && (VEC_INT)x > (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH - VEC_SIZE)), SELECT_TYPE); + + // Rotate src1/2_values, if values0 is a combination of src1_values and src2_values. + src1_values = select(src1_values, ROTATE(src1_values, VEC_SIZE, INPUT1_ROTATE_N), cond_in2); + src2_values = select(src2_values, ROTATE(src2_values, VEC_SIZE, INPUT1_ROTATE_N), cond_in2); + // Rotate src2/3_values, if values0 is a combination of src2_values and src3_values. + src2_values = select(src2_values, ROTATE(src2_values, VEC_SIZE, INPUT2_ROTATE_N), cond_in3); + src3_values = select(src3_values, ROTATE(src3_values, VEC_SIZE, INPUT2_ROTATE_N), cond_in3); + // Rotate src3/4_values, if values0 is a combination of src3_values and src4_values. + src3_values = select(src3_values, ROTATE(src3_values, VEC_SIZE, INPUT3_ROTATE_N), cond_in4); + src4_values = select(src4_values, ROTATE(src4_values, VEC_SIZE, INPUT3_ROTATE_N), cond_in4); + + cond_in2 = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH), SELECT_TYPE); + cond_in3 = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH), SELECT_TYPE); + cond_in4 = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH), SELECT_TYPE); + + VEC_TYPE values0 = select(src2_values, src1_values, cond_in2); + values0 = select(src3_values, values0, cond_in3); + values0 = select(src4_values, values0, cond_in4); + + STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) +} +#endif /* defined(INPUT2_WIDTH) && defined(INPUT3_WIDTH) */ +#endif /* defined(INPUT1_WIDTH) */ +#endif /* defined(DEPTH) && defined(ELEMENT_SIZE) */ + +#if defined(WIDTH_OFFSET) && defined(DEPTH) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) +/** This kernel concatenates the input tensor into the output tensor along the first dimension + * + * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float + * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE + * @note The offset for the first spatial dimension has to be passed at compile time using -DWIDTH_OFFSET. i.e. -DWIDTH_OFFSET=128 + * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16 + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the first source tensor in Z dimension (in bytes) + * @param[in] src_step_w src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ + +__kernel void concatenate_width( + TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst)) +{ + // Calculate input indices + const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); + const int y = get_global_id(1); + const int z = get_global_id(2) % (int)DEPTH; + const int w = get_global_id(2) / (int)DEPTH; + + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + z * src_stride_z + w * src_stride_w; + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + w * dst_stride_w; + + VEC_TYPE source_values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr); + +#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) + const VEC_QUANT out0 = requantize(source_values0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT); + STORE_VECTOR_SELECT(out, DATA_TYPE, dst_addr + WIDTH_OFFSET * sizeof(DATA_TYPE), VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) +#else /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ + STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + WIDTH_OFFSET * sizeof(DATA_TYPE), VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) +#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ +} + +#endif /* defined(WIDTH_OFFSET) && defined(DEPTH) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)*/ + +#if defined(VEC_SIZE_LEFTOVER) + +#if defined(HEIGHT_OFFSET) && defined(DEPTH) && defined(VEC_SIZE) +/** 
This kernel concatenates the input tensor into the output tensor along the second dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
+ * @note Supported vector sizes are 2, 4, 8 and 16.
+ * @note The offset for the second spatial dimension has to be passed at compile time using -DHEIGHT_OFFSET. i.e. -DHEIGHT_OFFSET=128
+ * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+
+__kernel void concatenate_height(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst))
+{
+ const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + (get_global_id(2) % DEPTH) * src_stride_z + (get_global_id(
+ 2) / DEPTH) * src_stride_w;
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + (get_global_id(2) % DEPTH) * dst_stride_z + (get_global_id(
+ 2) / DEPTH) * dst_stride_w;
+
+ VEC_TYPE source_values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+ const VEC_QUANT out0 = requantize(source_values0,
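+ // The input may carry different quantization info than the output it is
+ // concatenated into, so its values are re-expressed in the output's
+ // (OFFSET_OUT, SCALE_OUT) domain before being stored at the HEIGHT_OFFSET row.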
OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
+ STORE_VECTOR_SELECT(out, DATA_TYPE, dst_addr + HEIGHT_OFFSET * dst_stride_y, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+#else /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+ STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + HEIGHT_OFFSET * dst_stride_y, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+}
+
+#endif /* defined(HEIGHT_OFFSET) && defined(DEPTH) && defined(VEC_SIZE) */
+
+/** This kernel concatenates the input tensor into the output tensor along the third dimension
+ *
+ * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float
+ * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor.
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] offsets The offsets to the first valid element of the output tensor in bytes + */ +__kernel void concatenate( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + int offset) +{ + uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0); + + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z; + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z; + + VEC_TYPE source_values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr); + +#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) + source_values0 = requantize(source_values0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT); +#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ + + STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + offset, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) +} +#endif /* defined(VEC_SIZE_LEFTOVER) */ +#endif /* defined(DATA_TYPE) */ +#endif /* defined(VEC_SIZE) */ diff --git a/src/core/CL/cl_kernels/common/convert_fc_weights.cl b/src/core/CL/cl_kernels/common/convert_fc_weights.cl new file mode 100644 index 0000000000..01ef04a7d6 --- /dev/null +++ b/src/core/CL/cl_kernels/common/convert_fc_weights.cl @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(FACTOR_1) && defined(FACTOR_2) +/** Perform a NCHW -> NHWC or NHWC -> NCHW conversion for Fully Connected 2D weights. + * + * For NCHW -> NHWC, FACTOR_1 will be equal to the product of the first two dimensions of FullyConnectedLayer's input and FACTOR_2 will represent the number of channels of that tensor. + * For NHWC -> NCHW, FACTOR_1 and FACTOR_2 will hold the same values, but swapped. + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Original input tensor width*height and depth should be given as a preprocessor argument using -DFACTOR_1=size and -DFACTOR_2=size for NCHW and vice versa for NHWC. e.g. -DFACTOR_1=256 and -DFACTOR_2=128 + * + * @param[in] src_ptr Pointer to the source image. Supported data types: All. + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void convert_fc_weights( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(0) * dst_stride_x + (get_global_id(1) % FACTOR_1 * FACTOR_2 + get_global_id(1) / FACTOR_1) * dst_stride_y; + + *((__global DATA_TYPE *)dst_addr) = *((__global DATA_TYPE *)src.ptr); +} +#endif // defined(DATA_TYPE) && defined(FACTOR_1) && defined(FACTOR_2) diff --git a/src/core/CL/cl_kernels/common/convolution_layer.cl b/src/core/CL/cl_kernels/common/convolution_layer.cl new file mode 100644 index 0000000000..be76929ac8 --- /dev/null +++ b/src/core/CL/cl_kernels/common/convolution_layer.cl @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(NUM_GROUPS)
+/** This kernel reshapes the tensor's lowest three dimensions to a single column
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note The number of groups should be given as a preprocessor argument using -DNUM_GROUPS=number. e.g. -DNUM_GROUPS=2
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] bias_ptr Pointer to the bias tensor. Supported data types: F16/F32, for quantized types this must be nullptr
+ * @param[in] bias_stride_x Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] bias_step_x bias_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes The offset of the first element in the bias tensor
+ * @param[in] width The width of the input tensor
+ * @param[in] height The height of the input tensor
+ * @param[in] depth The depth of the input tensor
+ * @param[in] total_filters Total number of filters. 4th dimension of the weights matrix
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ */
+__kernel void reshape_to_columns(
+    TENSOR3D_DECLARATION(src),
+    IMAGE_DECLARATION(dst),
+#ifdef HAS_BIAS
+    VECTOR_DECLARATION(bias),
+#endif /* HAS_BIAS */
+    uint width, uint height, uint depth, uint total_filters, uint dst_stride_z)
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    bool is_last_thread = (get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1));
+
+    __global uchar *tmp_src_ptr = src.ptr;
+    __global uchar *tmp_dst_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(0) * dst_stride_y + get_global_id(1) * width * dst_stride_y + get_global_id(2) * width * height * dst_stride_y;
+#ifdef HAS_BIAS
+    __global uchar *tmp_bias_ptr = bias_ptr + bias_offset_first_element_in_bytes;
+#endif /* HAS_BIAS */
+
+    if(is_last_thread)
+    {
+        for(uint g = 0; g < NUM_GROUPS; ++g)
+        {
+            __global uchar *curr_group_dst = tmp_dst_ptr;
+
+            for(uint i = 0; i < total_filters / NUM_GROUPS; ++i)
+            {
+                *((__global DATA_TYPE *)curr_group_dst) = *((__global DATA_TYPE *)tmp_src_ptr);
+
+#ifdef HAS_BIAS
+                *((__global DATA_TYPE *)(curr_group_dst + dst_stride_y)) = *((__global DATA_TYPE *)(tmp_bias_ptr));
+                tmp_bias_ptr += bias_stride_x;
+#endif /* HAS_BIAS */
+                tmp_src_ptr += depth * src_stride_z;
+                curr_group_dst += dst_stride_x;
+            }
+
+            tmp_dst_ptr += dst_stride_z;
+        }
+    }
+    else
+    {
+        for(uint g = 0; g < NUM_GROUPS; ++g)
+        {
+            __global uchar *curr_group_dst = tmp_dst_ptr;
+
+            for(uint i = 0; i < total_filters / NUM_GROUPS; ++i)
+            {
+                *((__global DATA_TYPE *)curr_group_dst) = *((__global DATA_TYPE *)tmp_src_ptr);
+                tmp_src_ptr += depth * src_stride_z;
+                curr_group_dst += dst_stride_x;
+            }
+
+            tmp_dst_ptr += dst_stride_z;
+        }
+    }
+}
+#endif // defined(DATA_TYPE) && defined(NUM_GROUPS)
diff --git a/src/core/CL/cl_kernels/common/copy_tensor.cl b/src/core/CL/cl_kernels/common/copy_tensor.cl
new file mode 100644
index 0000000000..753b98d1b0
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/copy_tensor.cl
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
+/** Performs a copy of the input tensor to the output tensor.
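(Back-reference, for clarity: the reshape_to_columns kernel above flattens each filter's width x height x depth volume into one column of the destination matrix, one column per filter within each group, and the last work-item appends the bias values as an extra row when -DHAS_BIAS is set. A host-side sketch of the index mapping for the single-group case; the function name, float element type and dense layouts are illustrative assumptions:)

/* Host-side model: weights[f][z][y][x] -> columns[row][f],
 * where row flattens one filter volume as x + y*W + z*W*H. */
static void reshape_to_columns_ref(const float *weights, float *columns,
                                   int W, int H, int D, int num_filters)
{
    for (int f = 0; f < num_filters; ++f)
        for (int z = 0; z < D; ++z)
            for (int y = 0; y < H; ++y)
                for (int x = 0; x < W; ++x)
                {
                    const int row = x + y * W + z * W * H;
                    columns[row * num_filters + f] = weights[((f * D + z) * H + y) * W + x];
                }
}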
+ *
+ * @note The following variables must be passed at compile time:
+ * -# -DDATA_TYPE : Input and output datatypes.
+ * -# -DVEC_SIZE : The number of elements processed in X dimension
+ * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE
+ *
+ * @param[in] in_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] in_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: same as @p in_ptr
+ * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] out_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] out_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] out_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void copy_tensor(
+    TENSOR3D_DECLARATION(in),
+    TENSOR3D_DECLARATION(out))
+{
+    Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(in);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    // Boundary-aware access:
+    // If there's left-over in width (VEC_SIZE_LEFTOVER > 0):
+    // Shift all accesses other than the first to avoid accessing out of bounds
+    const int shift = max((int)(get_global_id(0) * VEC_SIZE) - (int)VEC_SIZE_LEFTOVER, 0) % VEC_SIZE;
+    in.ptr -= shift * in.stride_x;
+    out.ptr -= shift * out.stride_x;
+
+    // Load data
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+
+    // Boundary-aware store
+    STORE_VECTOR_SELECT(data, DATA_TYPE, (__global DATA_TYPE *)out.ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/crop_tensor.cl b/src/core/CL/cl_kernels/common/crop_tensor.cl
new file mode 100644
index 0000000000..d9090dc838
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/crop_tensor.cl
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
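(Back-reference, for clarity: the shift arithmetic in the copy_tensor kernel above is easiest to see with concrete numbers. A standalone sketch, assuming width 10 and VEC_SIZE 4, so VEC_SIZE_LEFTOVER = 10 % 4 = 2; the driver program itself is illustrative:)

#include <stdio.h>

#define VEC_SIZE 4
#define VEC_SIZE_LEFTOVER 2 /* e.g. width = 10 -> 10 % 4 = 2 */

int main(void)
{
    /* Model of the per-work-item shift used by copy_tensor. */
    for (int gid = 0; gid < 3; ++gid)
    {
        int raw   = gid * VEC_SIZE - VEC_SIZE_LEFTOVER;
        int shift = (raw > 0 ? raw : 0) % VEC_SIZE;
        int start = gid * VEC_SIZE - shift;
        int count = (gid == 0 && VEC_SIZE_LEFTOVER != 0) ? VEC_SIZE_LEFTOVER : VEC_SIZE;
        printf("work-item %d: elements [%d, %d)\n", gid, start, start + count);
    }
    return 0; /* prints [0,2), [2,6), [6,10): all 10 elements, no overlap */
}

Work-item 0 stores only the leftover elements via STORE_VECTOR_SELECT; every later work-item is shifted back so its full 4-wide access stays in bounds.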
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) // Compile time constants + +/** Performs a tensor cropping. + * + * @param[in] in_ptr Pointer to the source tensor. Supported data types: All + * @param[in] in_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] in_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] in_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] in_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] out_ptr Pointer to the destination tensor. Supported data types: F32 + * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] out_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] out_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] out_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] in_offset_y The initial offset of the input address along Y. + * @param[in] in_offset_z The initial offset of the input address along Z. 
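(The WIDTH_FLIPPED/HEIGHT_FLIPPED branches in the crop_tensor kernel below reduce to walking forwards or backwards from the initial input offset; a scalar model, with illustrative names:)

/* Maps an output row index to its input row: an unflipped crop walks
 * forward from in_offset_y, a flipped crop walks backward from it. */
static int crop_input_row(int out_row, int in_offset_y, int width_flipped)
{
    return width_flipped ? (in_offset_y - out_row) : (in_offset_y + out_row);
}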
+ */
+__kernel void crop_tensor(
+    TENSOR3D_DECLARATION(in),
+    TENSOR3D_DECLARATION(out),
+    int in_offset_y,
+    int in_offset_z)
+{
+    Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(in);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+    const int in_x = get_global_id(0) * (in_step_x / in_stride_x);
+
+#if defined(WIDTH_FLIPPED)
+    const int in_y = in_offset_y - get_global_id(1);
+#else  // defined(WIDTH_FLIPPED)
+    const int in_y = in_offset_y + get_global_id(1);
+#endif // defined(WIDTH_FLIPPED)
+
+#if defined(HEIGHT_FLIPPED)
+    const int in_z = in_offset_z - get_global_id(2);
+#else  // defined(HEIGHT_FLIPPED)
+    const int in_z = in_offset_z + get_global_id(2);
+#endif // defined(HEIGHT_FLIPPED)
+
+#if defined(VEC_SIZE)
+
+#if defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does then shift access vector to access elements within bounds
+    const int shift = max((int)(get_global_id(0) * VEC_SIZE) - (int)LAST_ACCESSED_X, 0);
+    in.ptr -= shift * in.stride_x;
+    out.ptr -= shift * out.stride_x;
+#endif // defined(LAST_ACCESSED_X)
+
+    __global const uchar *input_addr = tensor3D_offset(&in, in_x, in_y, in_z);
+
+    // Load data
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
+
+    // Store result
+    VSTORE(VEC_SIZE)
+    (CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE)), 0, (__global float *)out.ptr);
+#else  // defined(VEC_SIZE)
+    *((__global float *)(out.ptr)) = CONVERT(*((__global DATA_TYPE *)tensor3D_offset(&in, in_x, in_y, in_z)), float);
+#endif // defined(VEC_SIZE)
+}
+
+#endif // defined(DATA_TYPE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/deconvolution_layer.cl b/src/core/CL/cl_kernels/common/deconvolution_layer.cl
new file mode 100644
index 0000000000..4ac5e3f0e9
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/deconvolution_layer.cl
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function applies upsampling to an input image.
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: All.
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void deconvolution_upsample(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    // Store result
+    *((__global DATA_TYPE *)dst.ptr) = *((__global DATA_TYPE *)src.ptr);
+}
+
+#if defined(FILTER_WIDTH) && defined(FILTER_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE)
+/** This kernel reshapes the deconvolution output tensor before returning the result of the Deconvolution. The deconvolution output tensor
+ * is the result of a @ref CLGEMM operation between the deconvolution input and the deconvolution filter
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type, e.g., -DDATA_TYPE=F32
+ * @note The width of the filter should be given as a preprocessor argument using -DFILTER_WIDTH=width, e.g., -DFILTER_WIDTH=2
+ * @note The height of the filter should be given as a preprocessor argument using -DFILTER_HEIGHT=height, e.g., -DFILTER_HEIGHT=2
+ * @note The width of the input should be given as a preprocessor argument using -DSRC_WIDTH=width, e.g., -DSRC_WIDTH=10
+ * @note The height of the input should be given as a preprocessor argument using -DSRC_HEIGHT=height, e.g., -DSRC_HEIGHT=10
+ * @note The output data layout is NHWC if the preprocessor argument NUM_FILTERS is defined, NCHW if NUM_FILTERS is not defined
+ *
+ * @param[in] src_ptr Pointer to the source image.
Supported data types: QASYMM8/QASYMM8_SIGNED/F16/F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] bias_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32/S32 + * @param[in] bias_stride_x (Optional) Stride of the biases vector in X dimension (in bytes) + * @param[in] bias_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector + */ +__kernel void deconvolution_reshape( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst) +#if defined(ADD_BIAS) + , + VECTOR_DECLARATION(bias) +#endif // defined(ADD_BIAS) +) +{ +#define FILTER_AREA ((FILTER_WIDTH) * (FILTER_HEIGHT)) + + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(dst); + const DATA_TYPE data = *(__global DATA_TYPE *)src.ptr; + + // Store result + const int x_in = get_global_id(0); + const int y_in = get_global_id(1); + const int z_in = get_global_id(2); + +#if defined(NUM_FILTERS) + const int bias_index = x_in / (FILTER_AREA); + const int z_out = bias_index + (NUM_FILTERS) * (z_in / (SRC_HEIGHT)); + const int x_out = x_in % (FILTER_WIDTH) + y_in * (FILTER_WIDTH); + const int y_out = (FILTER_HEIGHT) * (z_in % (SRC_HEIGHT)) + ((x_in % (FILTER_AREA)) / (FILTER_WIDTH)); +#else // defined(NUM_FILTERS) + const int x_out = x_in / (FILTER_AREA); + const int y_out = x_in % (FILTER_WIDTH) + y_in * (FILTER_WIDTH); + const int z_out = (FILTER_HEIGHT) * z_in + ((x_in % (FILTER_AREA)) / (FILTER_WIDTH)); + const int bias_index = x_out; +#endif // defined(NUM_FILTERS) + +#if defined(ADD_BIAS) + Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias); + const DATA_TYPE bias_val = *(__global DATA_TYPE *)vector_offset(&bias, bias_index); + *((__global DATA_TYPE *)tensor3D_offset(&dst, x_out, y_out, z_out)) = data + bias_val; +#else // defined(ADD_BIAS) + *((__global DATA_TYPE *)tensor3D_offset(&dst, x_out, y_out, z_out)) = data; +#endif // defined(ADD_BIAS) + +#undef FILTER_AREA +} +#endif // defined(FILTER_WIDTH) && defined(FILTER_HEIGHT) && defined(SRC_WIDTH) && 
defined(SRC_HEIGHT) && defined(DATA_TYPE) diff --git a/src/core/CL/cl_kernels/common/dequantization_layer.cl b/src/core/CL/cl_kernels/common/dequantization_layer.cl new file mode 100644 index 0000000000..7fa62577ce --- /dev/null +++ b/src/core/CL/cl_kernels/common/dequantization_layer.cl @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET) + +/** This performs the dequantization of 8-bit unsigned integers to floating point. + * + * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char + * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Quantization scale of input tensor is passed in with -DSCALE=scale. + * @note Quantization offset of input tensor is passed in with -DOFFSET=offset. + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: F16/F32 + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void dequantization_layer( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + +#if defined(LAST_ACCESSED_X) + // Check if access on width gets out of bounds + // If it does shift access vector to access elements within bounds + const int xi = (int)(get_global_id(0) * VEC_SIZE); + input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x; + output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x; + + // Load data + VEC_DATA_TYPE(int, VEC_SIZE) + val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE)); + + // Create scale and offset vectors + const VEC_DATA_TYPE(float, VEC_SIZE) + vscale = SCALE; + + const VEC_DATA_TYPE(int, VEC_SIZE) + voffset = OFFSET; + + // Dequantize + VEC_DATA_TYPE(float, VEC_SIZE) + res = vscale * CONVERT((val - voffset), VEC_DATA_TYPE(float, VEC_SIZE)); + + // Store result + VSTORE(VEC_SIZE) + (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr); +#else // !defined(LAST_ACCESSED_X) + *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr))) - (int)(OFFSET)) * (float)(SCALE)); +#endif // defined(LAST_ACCESSED_X) +} +#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/common/elementwise_operation.cl b/src/core/CL/cl_kernels/common/elementwise_operation.cl new file mode 100644 index 0000000000..45dcbfc6e2 --- /dev/null +++ b/src/core/CL/cl_kernels/common/elementwise_operation.cl @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(DATA_TYPE) + +/** List of all the operations supported by this kernel. + * @note ADD and SUB operations, when executed on integers, support saturation */ +#ifdef SATURATE +#define ADD(x, y) add_sat((x), (y)) +#define SUB(x, y) sub_sat((x), (y)) +#else /* SATURATE */ +#define ADD(x, y) (x) + (y) +#define SUB(x, y) (x) - (y) +#endif /* SATURATE */ + +#define MAX(x, y) max(x, y) +#define MIN(x, y) min(x, y) +#define SQUARED_DIFF(x, y) (x - y) * (x - y) +#define POWER(x, y) pow(x, y) + +#if VEC_SIZE_OUT == 1 +#define PRELU(x, y) (x > 0 ? x : x * y) +#else // VEC_SIZE_OUT == 1 +#define PRELU(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)))) +#endif // VEC_SIZE_OUT == 1 + +#if defined(S32) +#define DIV(x, y) CONVERT(floor(CONVERT(x, VEC_DATA_TYPE(float, VEC_SIZE_OUT)) / CONVERT(y, VEC_DATA_TYPE(float, VEC_SIZE_OUT))), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)); +#else /* S32 */ +#define DIV(x, y) (x / y) +#endif /* S32 */ + +#define AND(x, y) (CONVERT((x && y), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))1)) +#define OR(x, y) (CONVERT((x || y), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))1)) + +#define OP_FUN_NAME_STR(op) elementwise_operation_##op +#define OP_FUN_NAME(op) OP_FUN_NAME_STR(op) + +#if defined(ACTIVATION_TYPE) +#include "activation_float_helpers.h" +#endif // defined(ACTIVATION_TYPE) + +/** This function executes an element-wise operation among two tensors. + * + * @note Vector sizes of inputs and output have to be passed at compile time using -DVEC_SIZE_IN1, -DVEC_SIZE_IN2, -DVEC_SIZE_OUT. + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_OUT=3. It is defined as the remainder between the input's first dimension and VEC_SIZE_OUT + * @note The input and output data_types need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar + * @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used. + * @note The element-wise operation to be executed has to be passed at compile time using -DOP (e.g., -DOP=ADD) + * + * @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32 + * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] in2_ptr Pointer to the source tensor. 
Supported data types: U8/S16/F16/F32 + * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8 (only if both inputs are U8), S16/F16/F32 + * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void OP_FUN_NAME(OP)( + TENSOR3D_DECLARATION(in1), + TENSOR3D_DECLARATION(in2) +#if !defined(IN_PLACE) + , + TENSOR3D_DECLARATION(out) +#endif // !defined(IN_PLACE) +) +{ +#if VEC_SIZE_IN1 == 1 + uint in1_x_offs = 0; +#else // VEC_SIZE_IN1 == 1 + uint in1_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN1 - (VEC_SIZE_IN1 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN1), 0); +#endif // VEC_SIZE_IN1 == 1 +#if VEC_SIZE_IN2 == 1 + uint in2_x_offs = 0; +#else // VEC_SIZE_IN2 == 1 + uint in2_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN2 - (VEC_SIZE_IN2 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN2), 0); +#endif // VEC_SIZE_IN2 == 1 +#if !defined(IN_PLACE) + uint out_x_offs = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0); +#endif // !defined(IN_PLACE) + + // Get pixels pointer + __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + in1_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * in1_step_y + get_global_id(2) * in1_step_z; + __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + in2_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * in2_step_y + get_global_id(2) * in2_step_z; + __global uchar * +#if !defined(IN_PLACE) + out_addr = out_ptr + out_offset_first_element_in_bytes + out_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z; +#else // !defined(IN_PLACE) +#if defined(SRC1_IN_PLACE) + out_addr = in1_addr; +#else //defined(SRC1_IN_PLACE) + out_addr = in2_addr; +#endif //defined(SRC1_IN_PLACE) +#endif // !defined(IN_PLACE) + + // Load values + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT) + in_a = CONVERT((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE *)in1_addr)), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT) + in_b = CONVERT((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE *)in2_addr)), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)); + + // Calculate and store result + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT) + res0 = 
OP(in_a, in_b); +#if defined(ACTIVATION_TYPE) + res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE_OUT, res0, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + STORE_VECTOR_SELECT(res, DATA_TYPE, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) +} +#endif /* defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(DATA_TYPE) */ diff --git a/src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl b/src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl new file mode 100644 index 0000000000..a11be80875 --- /dev/null +++ b/src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#define SUB(x, y) (x - y) +#define ADD(x, y) (x + y) +#define MAX(x, y) max((x), (y)) +#define MIN(x, y) min((x), (y)) +#define SQUARED_DIFF(x, y) (x - y) * (x - y) +#define PRELU(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(float, VEC_SIZE_OUT)))) +#define DIV(x, y) (x / y) + +#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) +#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) + +#define OP_FUN_NAME_STR(op) elementwise_operation_##op##_quantized +#define OP_FUN_NAME(op) OP_FUN_NAME_STR(op) + +#if defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE) + +#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE_OUT) +#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE_OUT) +#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT) + +/** This function executes an element-wise operation among two tensors. + * + * @note Vector sizes of inputs and output have to be passed at compile time using -DVEC_SIZE_IN1, -DVEC_SIZE_IN2, -DVEC_SIZE_OUT. + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE=3. It is defined as the remainder between the input's first dimension and VEC_SIZE + * @note In case of broadcasting along the X dimension the proper preprocessor argument should be passed depending on the input (e.g. 
-DIS_IN1_X_BROADCASTING, -DIS_IN2_X_BROADCASTING) + * @note The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, i.e. -DOFFSET_IN1=10 + * @note The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, i.e. -DOFFSET_IN2=10 + * @note The quantization offset of the output must be passed at compile time using -DOFFSET_OUT, i.e. -DOFFSET_OUT=10 + * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, i.e. -DSCALE_IN1=10 + * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, i.e. -DSCALE_IN2=10 + * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, i.e. -DSCALE_OUT=10 + * @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used. + * @note The element-wise operation to be executed has to be passed at compile time using -DOP (e.g., -DOP=ADD) + * @note For QSYMM16 operations OFFSET_IN1, OFFSET_IN2 and OFFSET_OUT must be set to zero + * @note The data type must be passed at compile time using -DDATA_TYPE, i.e. -DDATA_TYPE=uchar + * + * @param[in] in1_ptr Pointer to the source tensor. Supported data types: QASYMM8/QSYMM16 + * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] in2_ptr Pointer to the source tensor. Supported data types: same as @p in1_ptr + * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] out_ptr Pointer to the destination tensor. 
Supported data types: same as @p in1_ptr + * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void OP_FUN_NAME(OP)( + TENSOR3D_DECLARATION(in1), + TENSOR3D_DECLARATION(in2) +#if !defined(IN_PLACE) + , + TENSOR3D_DECLARATION(out) +#endif // !defined(IN_PLACE) +) +{ +#if VEC_SIZE_IN1 == 1 + uint in1_x_offs = 0; +#else // VEC_SIZE_IN1 == 1 + uint in1_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN1 - (VEC_SIZE_IN1 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN1), 0); +#endif // VEC_SIZE_IN1 == 1 +#if VEC_SIZE_IN2 == 1 + uint in2_x_offs = 0; +#else // VEC_SIZE_IN2 == 1 + uint in2_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN2 - (VEC_SIZE_IN2 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN2), 0); +#endif // VEC_SIZE_IN2 == 1 +#if !defined(IN_PLACE) + uint out_x_offs = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0); +#endif // !defined(IN_PLACE) + + // Get pixels pointer + __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + in1_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * in1_step_y + get_global_id(2) * in1_step_z; + __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + in2_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * in2_step_y + get_global_id(2) * in2_step_z; + __global uchar * +#if !defined(IN_PLACE) + out_addr = out_ptr + out_offset_first_element_in_bytes + out_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z; +#else // !defined(IN_PLACE) +#if defined(SRC1_IN_PLACE) + out_addr = in1_addr; +#else //defined(SRC1_IN_PLACE) + out_addr = in2_addr; +#endif //defined(SRC1_IN_PLACE) +#endif // !defined(IN_PLACE) + + VEC_INT in_a = CONVERT((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE *)in1_addr)), VEC_INT); + VEC_INT in_b = CONVERT((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE *)in2_addr)), VEC_INT); + + in_a = SUB(in_a, (VEC_INT)((int)OFFSET_IN1)); + in_b = SUB(in_b, (VEC_INT)((int)OFFSET_IN2)); + + const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1); + const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2); + const VEC_FLOAT qresf32 = OP(in1f32, in2f32) / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFFSET_OUT)); + const VEC_TYPE res0 = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_TYPE); + + // Store result + STORE_VECTOR_SELECT(res, DATA_TYPE, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) +} +#endif /* defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE) */ diff --git a/src/core/CL/cl_kernels/common/elementwise_unary.cl b/src/core/CL/cl_kernels/common/elementwise_unary.cl new file mode 
100644 index 0000000000..d2d9d97d33 --- /dev/null +++ b/src/core/CL/cl_kernels/common/elementwise_unary.cl @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" +#include "warp_helpers.h" + +#if defined(DATA_TYPE) && defined(OPERATION) + +// Calculate exponential +#define exp_op(input) exp(input) +// Calculate reverse square root +#define rsqrt_op(input) rsqrt(input) +// Calculate negative +#define neg_op(input) (-input) +// Calculate sine +#define sin_op(input) sin(input) +// Calculate abs for floating point values +#define fabs_op(input) fabs(input) +// Calculate natural_log +#define natural_log_op(input) log(input) +// Calculate round (Cannot use round function as it rounds halfway cases away from zero). +#if defined(VEC_SIZE) +#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) +#define round_op(input) CONVERT(CONVERT_SAT_ROUND(input, VEC_DATA_TYPE(int, VEC_SIZE), rte), VEC_TYPE) +#define logical_not_op(input) CONVERT(CONVERT(!input, VEC_TYPE) & ((VEC_TYPE)0x1), VEC_TYPE) +#else // defined(VEC_SIZE) +#define round_op(input) CONVERT(CONVERT_SAT_ROUND(input, int, rte), DATA_TYPE) +#define logical_not_op(input) ((!input) & 0x1) +#endif // defined(VEC_SIZE) + +/** Applies element wise unary operator in a tensor. + * + * @param[in] in_ptr Pointer to the source image. Supported data types: F16/32. + * @param[in] in_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] in_step_z in_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: F16/32. 
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ */
+__kernel void elementwise_unary(
+    TENSOR3D_DECLARATION(in),
+    TENSOR3D_DECLARATION(out))
+{
+    Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(in);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    in.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * in_stride_x;
+    out.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * out_stride_x;
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+
+    VSTORE(VEC_SIZE)
+    (OPERATION(data), 0, (__global DATA_TYPE *)out.ptr);
+#else  // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+    *((__global DATA_TYPE *)(out.ptr)) = (DATA_TYPE)(OPERATION(*((__global DATA_TYPE *)in.ptr)));
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+}
+#endif // defined(DATA_TYPE) && defined(OPERATION)
diff --git a/src/core/CL/cl_kernels/common/fft.cl b/src/core/CL/cl_kernels/common/fft.cl
new file mode 100644
index 0000000000..3f26d0f1a6
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/fft.cl
@@ -0,0 +1,1880 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE)
+/** Calculates and applies the twiddle factor to a given input.
+ *
+ * @param[in] phi The angle.
+ * @param[in,out] input The input on which the factor should be applied.
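(For reference, the TWIDDLE_FACTOR_MULTIPLICATION macro defined next is a complex multiplication by w = cos(phi) + i*sin(phi). The same computation in plain C, with an illustrative two-float struct:)

#include <math.h>

typedef struct { float x, y; } complex_f; /* x = real part, y = imaginary part */

/* Multiply c by the twiddle factor w = cos(phi) + i*sin(phi):
 * (a + ib)(c + id) = (ac - bd) + i(ad + bc). */
static complex_f twiddle_mul(complex_f c, float phi)
{
    const float wx = cosf(phi);
    const float wy = sinf(phi);
    complex_f r = { wx * c.x - wy * c.y, wx * c.y + wy * c.x };
    return r;
}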
+ */ +#define TWIDDLE_FACTOR_MULTIPLICATION(phi, input) \ + { \ + VEC_DATA_TYPE(DATA_TYPE, 2) \ + w, tmp; \ + w.x = cos(phi); \ + w.y = sin(phi); \ + tmp.x = (w.x * input.x) - (w.y * input.y); \ + tmp.y = (w.x * input.y) + (w.y * input.x); \ + input = tmp; \ + } + +/** Computes radix-2 butterfly unit. + * + * @param[in,out] c0 Complex input 0. + * @param[in,out] c1 Complex input 1. + */ +#define DFT_2(c0, c1) \ + { \ + VEC_DATA_TYPE(DATA_TYPE, 2) \ + v0; \ + v0 = c0; \ + c0 = v0 + c1; \ + c1 = v0 - c1; \ + } + +// radix-3 butterfly unit factors +#define SQRT3DIV2 0.86602540378443f + +/** Computes radix-3 butterfly unit. + * + * @param[in,out] c0 Complex input 0. + * @param[in,out] c1 Complex input 1. + * @param[in,out] c2 Complex input 2. + */ +#define DFT_3(c0, c1, c2) \ + { \ + VEC_DATA_TYPE(DATA_TYPE, 2) \ + v0 = c1 + c2; \ + VEC_DATA_TYPE(DATA_TYPE, 2) \ + v1 = c1 - c2; \ + c1.x = c0.x - 0.5f * v0.x + v1.y * SQRT3DIV2; \ + c1.y = c0.y - 0.5f * v0.y - v1.x * SQRT3DIV2; \ + c2.x = c0.x - 0.5f * v0.x - v1.y * SQRT3DIV2; \ + c2.y = c0.y - 0.5f * v0.y + v1.x * SQRT3DIV2; \ + c0 = c0 + v0; \ + } + +/**Computes radix-4 butterfly unit. + * + * @param[in,out] c0 Complex input 0. + * @param[in,out] c1 Complex input 1. + * @param[in,out] c2 Complex input 2. + * @param[in,out] c3 Complex input 3. + */ +#define DFT_4(c0, c1, c2, c3) \ + { \ + VEC_DATA_TYPE(DATA_TYPE, 2) \ + v0, v1, v2, v3; \ + v0 = c0 + c2; \ + v1 = c1 + c3; \ + v2 = c0 - c2; \ + v3.x = c1.y - c3.y; \ + v3.y = c3.x - c1.x; \ + c0 = v0 + v1; \ + c2 = v0 - v1; \ + c1 = v2 + v3; \ + c3 = v2 - v3; \ + } + +// radix-5 butterfly unit factors +#define W5_A (DATA_TYPE)0.30901699437494f +#define W5_B (DATA_TYPE)0.95105651629515f +#define W5_C (DATA_TYPE)0.80901699437494f +#define W5_D (DATA_TYPE)0.58778525229247f + +/** Computes radix-5 butterfly unit. + * + * @param[in,out] c0 Complex input 0. + * @param[in,out] c1 Complex input 1. + * @param[in,out] c2 Complex input 2. + * @param[in,out] c3 Complex input 3. + * @param[in,out] c4 Complex input 4. + */ +#define DFT_5(c0, c1, c2, c3, c4) \ + { \ + VEC_DATA_TYPE(DATA_TYPE, 2) \ + v0, v1, v2, v3, v4; \ + v0 = c0; \ + v1 = W5_A * (c1 + c4) - W5_C * (c2 + c3); \ + v2 = W5_C * (c1 + c4) - W5_A * (c2 + c3); \ + v3 = W5_D * (c1 - c4) - W5_B * (c2 - c3); \ + v4 = W5_B * (c1 - c4) + W5_D * (c2 - c3); \ + c0 = v0 + c1 + c2 + c3 + c4; \ + c1 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v4.y, -v4.x); \ + c2 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v3.y, -v3.x); \ + c3 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v3.y, v3.x); \ + c4 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v4.y, v4.x); \ + } + +// radix-7 butterfly unit factors +#define W7_A (DATA_TYPE)0.62348980185873f +#define W7_B (DATA_TYPE)0.78183148246802f +#define W7_C (DATA_TYPE)0.22252093395631f +#define W7_D (DATA_TYPE)0.97492791218182f +#define W7_E (DATA_TYPE)0.90096886790241f +#define W7_F (DATA_TYPE)0.43388373911755f + +/** Computes radix-7 butterfly unit. + * + * @param[in,out] c0 Complex input 0. + * @param[in,out] c1 Complex input 1. + * @param[in,out] c2 Complex input 2. + * @param[in,out] c3 Complex input 3. + * @param[in,out] c4 Complex input 4. + * @param[in,out] c5 Complex input 5. + * @param[in,out] c6 Complex input 6. 
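(The radix-3/5/7 butterfly constants above are cosines and sines of fractions of 2*pi; a quick standalone check, assuming a C99 math library:)

#include <math.h>
#include <stdio.h>

int main(void)
{
    const double pi = acos(-1.0);
    /* Each constant is a cosine or sine of a multiple of 2*pi/radix. */
    printf("SQRT3DIV2 = %.14f\n", sin(2.0 * pi / 3.0));  /* 0.86602540378444 */
    printf("W5_A      = %.14f\n", cos(2.0 * pi / 5.0));  /* 0.30901699437495 */
    printf("W5_B      = %.14f\n", sin(2.0 * pi / 5.0));  /* 0.95105651629515 */
    printf("W5_C      = %.14f\n", -cos(4.0 * pi / 5.0)); /* 0.80901699437495 */
    printf("W5_D      = %.14f\n", sin(4.0 * pi / 5.0));  /* 0.58778525229247 */
    printf("W7_A      = %.14f\n", cos(2.0 * pi / 7.0));  /* 0.62348980185873 */
    printf("W7_B      = %.14f\n", sin(2.0 * pi / 7.0));  /* 0.78183148246803 */
    return 0;
}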
+ */ +#define DFT_7(c0, c1, c2, c3, c4, c5, c6) \ + { \ + VEC_DATA_TYPE(DATA_TYPE, 2) \ + v0, v1, v2, v3, v4, v5, v6; \ + v0 = c0; \ + v1 = W7_A * (c1 + c6) - W7_C * (c2 + c5) - W7_E * (c3 + c4); \ + v2 = W7_C * (c1 + c6) + W7_E * (c2 + c5) - W7_A * (c3 + c4); \ + v3 = W7_E * (c1 + c6) - W7_A * (c2 + c5) + W7_C * (c3 + c4); \ + v4 = W7_B * (c1 - c6) + W7_D * (c2 - c5) + W7_F * (c3 - c4); \ + v5 = W7_D * (c1 - c6) - W7_F * (c2 - c5) - W7_B * (c3 - c4); \ + v6 = W7_F * (c1 - c6) - W7_B * (c2 - c5) + W7_D * (c3 - c4); \ + c0 = v0 + c1 + c2 + c3 + c4 + c5 + c6; \ + c1 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v4.y, -v4.x); \ + c2 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v5.y, -v5.x); \ + c3 = v0 - v3 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v6.y, -v6.x); \ + c4 = v0 - v3 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v6.y, v6.x); \ + c5 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v5.y, v5.x); \ + c6 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v4.y, v4.x); \ + } + +/** Computes radix-8 butterfly unit. + * + * @param[in,out] c0 Complex input 0. + * @param[in,out] c1 Complex input 1. + * @param[in,out] c2 Complex input 2. + * @param[in,out] c3 Complex input 3. + * @param[in,out] c4 Complex input 4. + * @param[in,out] c5 Complex input 5. + * @param[in,out] c6 Complex input 6. + * @param[in,out] c7 Complex input 7. + */ +#define DFT_8(c0, c1, c2, c3, c4, c5, c6, c7) \ + { \ + VEC_DATA_TYPE(DATA_TYPE, 2) \ + v0, v1, v2, v3, v4, v5, v6, v7; \ + VEC_DATA_TYPE(DATA_TYPE, 2) \ + s0, s1, s2, s3, s4, s5, s6, s7; \ + VEC_DATA_TYPE(DATA_TYPE, 2) \ + t0, t1, t2; \ + v0 = c0 + c4; \ + v1 = c1 + c5; \ + v2 = c2 + c6; \ + v3 = c3 + c7; \ + v4 = c0 - c4; \ + v5 = c1 - c5; \ + v6 = c2 - c6; \ + v7 = c3 - c7; \ + s0 = v0 + v2; \ + s1 = v1 + v3; \ + s2 = v0 - v2; \ + s3 = v1 - v3; \ + s4.x = v4.x - v6.y; \ + s4.y = v4.y + v6.x; \ + s5.x = v5.x - v7.y; \ + s5.y = v5.y + v7.x; \ + s6.x = v4.x + v6.y; \ + s6.y = v4.y - v6.x; \ + s7.x = v5.x + v7.y; \ + s7.y = v5.y - v7.x; \ + t0.x = -s3.y; \ + t0.y = s3.x; \ + t1.x = M_SQRT1_2_F * (s5.x - s5.y); \ + t1.y = M_SQRT1_2_F * (s5.x + s5.y); \ + t2.x = -M_SQRT1_2_F * (s7.x + s7.y); \ + t2.y = M_SQRT1_2_F * (s7.x - s7.y); \ + c0 = s0 + s1; \ + c1 = s6 - t2; \ + c2 = s2 - t0; \ + c3 = s4 - t1; \ + c4 = s0 - s1; \ + c5 = s6 + t2; \ + c6 = s2 + t0; \ + c7 = s4 + t1; \ + } + +/** Computes the first stage of a radix-2 DFT on axis 0. + * + * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time + * + * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/f32 + * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr (Optional) Pointer to the destination image. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image + */ +__kernel void fft_radix_2_first_stage_axis_0( + TENSOR3D_DECLARATION(input) +#ifndef IN_PLACE + , + TENSOR3D_DECLARATION(output) +#endif /* not IN_PLACE */ +) +{ + // Get tensor pointers + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); +#ifdef IN_PLACE + Tensor3D output = input; +#else /* IN_PLACE */ + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); +#endif /* IN_PLACE */ + + // Load two complex input values + VEC_DATA_TYPE(DATA_TYPE, 4) + data = vload4(0, (__global DATA_TYPE *)input.ptr); + + // Compute DFT N = 2 + DFT_2(data.s01, data.s23); + + // Store two complex output values + vstore4(data, 0, (__global DATA_TYPE *)output.ptr); +} + +/** Computes the first stage of a radix-2 DFT on axis 1. + * + * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time + * + * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/f32 + * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr (Optional) Pointer to the destination image. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image + */ +__kernel void fft_radix_2_first_stage_axis_1( + TENSOR3D_DECLARATION(input) +#ifndef IN_PLACE + , + TENSOR3D_DECLARATION(output) +#endif /* not IN_PLACE */ +) +{ + // Get tensor pointers + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); +#ifdef IN_PLACE + Tensor3D output = input; +#else /* IN_PLACE */ + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); +#endif /* IN_PLACE */ + + // Load two complex input values + VEC_DATA_TYPE(DATA_TYPE, 2) + data1 = vload2(0, (__global DATA_TYPE *)input.ptr); + VEC_DATA_TYPE(DATA_TYPE, 2) + data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); + + // Compute DFT N = 2 + DFT_2(data1, data2); + + // Store two complex output values + vstore2(data1, 0, (__global DATA_TYPE *)output.ptr); + vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0)); +} + +/** Computes the first stage of a radix-3 DFT on axis 0. + * + * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time + * + * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/f32 + * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr (Optional) Pointer to the destination image. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image + */ +__kernel void fft_radix_3_first_stage_axis_0( + TENSOR3D_DECLARATION(input) +#ifndef IN_PLACE + , + TENSOR3D_DECLARATION(output) +#endif /* not IN_PLACE */ +) +{ + // Get tensor pointers + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); +#ifdef IN_PLACE + Tensor3D output = input; +#else /* IN_PLACE */ + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); +#endif /* IN_PLACE */ + + // Load three complex input values + VEC_DATA_TYPE(DATA_TYPE, 4) + data0 = vload4(0, (__global DATA_TYPE *)input.ptr); + VEC_DATA_TYPE(DATA_TYPE, 2) + data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2, 0, 0)); + + // Compute DFT N = 3 + DFT_3(data0.s01, data0.s23, data1.s01); + + // Store three complex output values + vstore4(data0, 0, (__global DATA_TYPE *)output.ptr); + vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2, 0, 0)); +} + +/** Computes the first stage of a radix-3 DFT on axis 1. + * + * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time + * + * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/f32 + * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr (Optional) Pointer to the destination image. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image + */ +__kernel void fft_radix_3_first_stage_axis_1( + TENSOR3D_DECLARATION(input) +#ifndef IN_PLACE + , + TENSOR3D_DECLARATION(output) +#endif /* not IN_PLACE */ +) +{ + // Get tensor pointers + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); +#ifdef IN_PLACE + Tensor3D output = input; +#else /* IN_PLACE */ + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); +#endif /* IN_PLACE */ + + // Load three complex input values + VEC_DATA_TYPE(DATA_TYPE, 2) + data0 = vload2(0, (__global DATA_TYPE *)input.ptr); + VEC_DATA_TYPE(DATA_TYPE, 2) + data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); + VEC_DATA_TYPE(DATA_TYPE, 2) + data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); + + // Compute DFT N = 3 + DFT_3(data0, data1, data2); + + // Store three complex output values + vstore2(data0, 0, (__global DATA_TYPE *)output.ptr); + vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0)); + vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0)); +} + +/** Computes the first stage of a radix-4 DFT on axis 0. + * + * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time + * + * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/f32 + * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr (Optional) Pointer to the destination image. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image + */ +__kernel void fft_radix_4_first_stage_axis_0( + TENSOR3D_DECLARATION(input) +#ifndef IN_PLACE + , + TENSOR3D_DECLARATION(output) +#endif /* not IN_PLACE */ +) +{ + // Get tensor pointers + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); +#ifdef IN_PLACE + Tensor3D output = input; +#else /* IN_PLACE */ + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); +#endif /* IN_PLACE */ + + // Load four complex input values + VEC_DATA_TYPE(DATA_TYPE, 8) + data = vload8(0, (__global DATA_TYPE *)input.ptr); + + // Compute DFT N = 4 + DFT_4(data.s01, data.s23, data.s45, data.s67); + + // Store four complex output values + vstore8(data, 0, (__global DATA_TYPE *)output.ptr); +} + +/** Computes the first stage of a radix-4 DFT on axis 1. + * + * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time + * + * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/f32 + * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr (Optional) Pointer to the destination image. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image + */ +__kernel void fft_radix_4_first_stage_axis_1( + TENSOR3D_DECLARATION(input) +#ifndef IN_PLACE + , + TENSOR3D_DECLARATION(output) +#endif /* not IN_PLACE */ +) +{ + // Get tensor pointers + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); +#ifdef IN_PLACE + Tensor3D output = input; +#else /* IN_PLACE */ + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); +#endif /* IN_PLACE */ + + // Load four complex input values + VEC_DATA_TYPE(DATA_TYPE, 2) + data0 = vload2(0, (__global DATA_TYPE *)input.ptr); + VEC_DATA_TYPE(DATA_TYPE, 2) + data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); + VEC_DATA_TYPE(DATA_TYPE, 2) + data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); + VEC_DATA_TYPE(DATA_TYPE, 2) + data3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0)); + + // Compute DFT N = 4 + DFT_4(data0, data1, data2, data3); + + // Store four complex output values + vstore2(data0, 0, (__global DATA_TYPE *)output.ptr); + vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0)); + vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0)); + vstore2(data3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3, 0)); +} + +/** Computes the first stage of a radix-5 DFT on axis 0. + * + * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time + * + * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/f32 + * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr (Optional) Pointer to the destination image. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image + */ +__kernel void fft_radix_5_first_stage_axis_0( + TENSOR3D_DECLARATION(input) +#ifndef IN_PLACE + , + TENSOR3D_DECLARATION(output) +#endif /* not IN_PLACE */ +) +{ + // Get tensor pointers + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); +#ifdef IN_PLACE + Tensor3D output = input; +#else /* IN_PLACE */ + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); +#endif /* IN_PLACE */ + + // Load five complex input values + VEC_DATA_TYPE(DATA_TYPE, 8) + data0 = vload8(0, (__global DATA_TYPE *)input.ptr); + VEC_DATA_TYPE(DATA_TYPE, 2) + data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4, 0, 0)); + + // Compute DFT N = 5 + DFT_5(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01); + + // Store five complex output values + vstore8(data0, 0, (__global DATA_TYPE *)output.ptr); + vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4, 0, 0)); +} + +/** Computes the first stage of a radix-5 DFT on axis 1. + * + * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time + * + * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/f32 + * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr (Optional) Pointer to the destination image. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image + */ +__kernel void fft_radix_5_first_stage_axis_1( + TENSOR3D_DECLARATION(input) +#ifndef IN_PLACE + , + TENSOR3D_DECLARATION(output) +#endif /* not IN_PLACE */ +) +{ + // Get tensor pointers + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); +#ifdef IN_PLACE + Tensor3D output = input; +#else /* IN_PLACE */ + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); +#endif /* IN_PLACE */ + + // Load five complex input values + VEC_DATA_TYPE(DATA_TYPE, 2) + data0 = vload2(0, (__global DATA_TYPE *)input.ptr); + VEC_DATA_TYPE(DATA_TYPE, 2) + data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); + VEC_DATA_TYPE(DATA_TYPE, 2) + data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); + VEC_DATA_TYPE(DATA_TYPE, 2) + data3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0)); + VEC_DATA_TYPE(DATA_TYPE, 2) + data4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0)); + + // Compute DFT N = 5 + DFT_5(data0, data1, data2, data3, data4); + + // Store five complex output values + vstore2(data0, 0, (__global DATA_TYPE *)output.ptr); + vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0)); + vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0)); + vstore2(data3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3, 0)); + vstore2(data4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4, 0)); +} + +/** Computes the first stage of a radix-7 DFT on axis 0. + * + * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time + * + * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/f32 + * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr (Optional) Pointer to the destination image. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image + */ +__kernel void fft_radix_7_first_stage_axis_0( + TENSOR3D_DECLARATION(input) +#ifndef IN_PLACE + , + TENSOR3D_DECLARATION(output) +#endif /* not IN_PLACE */ +) +{ + // Get tensor pointers + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); +#ifdef IN_PLACE + Tensor3D output = input; +#else /* IN_PLACE */ + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); +#endif /* IN_PLACE */ + + // Load seven complex input values + VEC_DATA_TYPE(DATA_TYPE, 8) + data0 = vload8(0, (__global DATA_TYPE *)input.ptr); + VEC_DATA_TYPE(DATA_TYPE, 4) + data1 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4, 0, 0)); + VEC_DATA_TYPE(DATA_TYPE, 2) + data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 6, 0, 0)); + + // Compute DFT N = 7 + DFT_7(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01, data1.s23, data2.s01); + + // Store seven complex output values + vstore8(data0, 0, (__global DATA_TYPE *)output.ptr); + vstore4(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4, 0, 0)); + vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 6, 0, 0)); +} + +/** Computes the first stage of a radix-7 DFT on axis 1. + * + * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time + * + * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/f32 + * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr (Optional) Pointer to the destination image. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image + */ +__kernel void fft_radix_7_first_stage_axis_1( + TENSOR3D_DECLARATION(input) +#ifndef IN_PLACE + , + TENSOR3D_DECLARATION(output) +#endif /* not IN_PLACE */ +) +{ + // Get tensor pointers + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); +#ifdef IN_PLACE + Tensor3D output = input; +#else /* IN_PLACE */ + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); +#endif /* IN_PLACE */ + + // Load seven complex input values + VEC_DATA_TYPE(DATA_TYPE, 2) + data0 = vload2(0, (__global DATA_TYPE *)input.ptr); + VEC_DATA_TYPE(DATA_TYPE, 2) + data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); + VEC_DATA_TYPE(DATA_TYPE, 2) + data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); + VEC_DATA_TYPE(DATA_TYPE, 2) + data3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0)); + VEC_DATA_TYPE(DATA_TYPE, 2) + data4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0)); + VEC_DATA_TYPE(DATA_TYPE, 2) + data5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5, 0)); + VEC_DATA_TYPE(DATA_TYPE, 2) + data6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6, 0)); + + // Compute DFT N = 7 + DFT_7(data0, data1, data2, data3, data4, data5, data6); + + // Store seven complex output values + vstore2(data0, 0, (__global DATA_TYPE *)output.ptr); + vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0)); + vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0)); + vstore2(data3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3, 0)); + vstore2(data4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4, 0)); + vstore2(data5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 5, 0)); + vstore2(data6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 6, 0)); +} + +/** Computes the first stage of a radix-8 DFT on axis 0. + * + * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time + * + * @param[in,out] input_ptr Pointer to the source tensor. 
Supported data types: F16/f32 + * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr + * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image + */ +__kernel void fft_radix_8_first_stage_axis_0( + TENSOR3D_DECLARATION(input) +#ifndef IN_PLACE + , + TENSOR3D_DECLARATION(output) +#endif /* not IN_PLACE */ +) +{ + // Get tensor pointers + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); +#ifdef IN_PLACE + Tensor3D output = input; +#else /* IN_PLACE */ + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); +#endif /* IN_PLACE */ + + // Load eight complex input values + VEC_DATA_TYPE(DATA_TYPE, 16) + data = vload16(0, (__global DATA_TYPE *)input.ptr); + + // Compute DFT N = 8 + DFT_8(data.s01, data.s23, data.s45, data.s67, data.s89, data.sAB, data.sCD, data.sEF); + + // Store eight complex output values + vstore16(data, 0, (__global DATA_TYPE *)output.ptr); +} + +/** Computes the first stage of a radix-8 DFT on axis 1. + * + * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time + * + * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/f32 + * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr (Optional) Pointer to the destination image. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image + */ +__kernel void fft_radix_8_first_stage_axis_1( + TENSOR3D_DECLARATION(input) +#ifndef IN_PLACE + , + TENSOR3D_DECLARATION(output) +#endif /* not IN_PLACE */ +) +{ + // Get tensor pointers + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); +#ifdef IN_PLACE + Tensor3D output = input; +#else /* IN_PLACE */ + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); +#endif /* IN_PLACE */ + + // Load eight complex input values + VEC_DATA_TYPE(DATA_TYPE, 2) + data0 = vload2(0, (__global DATA_TYPE *)input.ptr); + VEC_DATA_TYPE(DATA_TYPE, 2) + data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); + VEC_DATA_TYPE(DATA_TYPE, 2) + data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); + VEC_DATA_TYPE(DATA_TYPE, 2) + data3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0)); + VEC_DATA_TYPE(DATA_TYPE, 2) + data4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0)); + VEC_DATA_TYPE(DATA_TYPE, 2) + data5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5, 0)); + VEC_DATA_TYPE(DATA_TYPE, 2) + data6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6, 0)); + VEC_DATA_TYPE(DATA_TYPE, 2) + data7 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 7, 0)); + + // Compute DFT N = 8 + DFT_8(data0, data1, data2, data3, data4, data5, data6, data7); + + // Store eight complex output values + vstore2(data0, 0, (__global DATA_TYPE *)output.ptr); + vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0)); + vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0)); + vstore2(data3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3, 0)); + vstore2(data4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4, 0)); + vstore2(data5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 5, 0)); + vstore2(data6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 6, 0)); + vstore2(data7, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 7, 0)); +} + +/** Computes a stage of a radix-2 FFT on axis 0. + * + * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time + * + * @param[in,out] input_ptr Pointer to the source tensor. 
Supported data types: F16/F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Product of the radix orders of all previous stages
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+__kernel void fft_radix_2_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-2
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load two complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
+
+    // Compute phi
+    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+
+    // Compute DFT N = 2
+    DFT_2(c0, c1);
+
+    // Store two complex output values
+    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-2 FFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor.
Supported data types: F16/F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Product of the radix orders of all previous stages
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+__kernel void fft_radix_2_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-2
+    uint kx = get_global_id(1);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load two complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
+
+    // Compute phi
+    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+
+    // Compute DFT N = 2
+    DFT_2(c0, c1);
+
+    // Store two complex output values
+    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
+}
+
+/** Computes a stage of a radix-3 FFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Product of the radix orders of all previous stages
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+__kernel void fft_radix_3_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-3
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load three complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+
+    // Compute phi
+    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+
+    // Compute DFT N = 3
+    DFT_3(c0, c1, c2);
+
+    // Store three complex output values
+    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
+    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-3 FFT on axis 1.
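+ *
+ * @note Same butterfly as fft_radix_3_axis_0, but the butterfly index comes from
+ *       get_global_id(1) and the three complex elements are gathered with a stride
+ *       of Nx along Y instead of along X.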
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Product of the radix orders of all previous stages
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
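+// Worked example of the staged indexing below (illustrative only, not part of the kernel):
+// for a length-9 axis decomposed as 3 x 3, this second radix-3 stage runs with Nx = 3 and
+// Ni = Nx * Ny = 9. Work-item ky = 1 computes nx = 1 % 3 = 1 and n = 1 + (1 / 3) * 9 = 1,
+// so it gathers rows 1, 4 and 7, rotates the last two by the twiddle angles phi and 2 * phi
+// with phi = nx * exp_const, and applies DFT_3. The concrete value of exp_const is chosen by
+// the host; +/- 2 * pi / Ni (with the sign selecting the transform direction) is assumed here
+// purely for illustration.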
+__kernel void fft_radix_3_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-3
+    uint kx = get_global_id(1);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load three complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+
+    // Compute phi
+    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+
+    // Compute DFT N = 3
+    DFT_3(c0, c1, c2);
+
+    // Store three complex output values
+    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
+    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+}
+
+/** Computes a stage of a radix-4 FFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Product of the radix orders of all previous stages
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+__kernel void fft_radix_4_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-4
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load four complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+
+    // Compute phi
+    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+
+    // Compute DFT N = 4
+    DFT_4(c0, c1, c2, c3);
+
+    // Store four complex output values
+    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
+    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+    vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-4 FFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Product of the radix orders of all previous stages
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
+__kernel void fft_radix_4_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-4
+    uint kx = get_global_id(1);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load four complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3 * Nx, 0));
+
+    // Compute phi
+    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+
+    // Compute DFT N = 4
+    DFT_4(c0, c1, c2, c3);
+
+    // Store four complex output values
+    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
+    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+    vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3 * Nx, 0));
+}
+
+/** Computes a stage of a radix-5 FFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in]     output_step_z                        (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in]     Nx                                   The butterfly span. Product of the radix orders of all previous stages
+ * @param[in]     Ni                                   Nx * Ny.
+ * @param[in]     exp_const                            Exponent constant
+ */
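+// Data-flow sketch for the radix-5 stage kernels below (j = 0..4, phi = nx * exp_const):
+//   c_j  = in[n + j * Nx]                       // strided gather along the FFT axis
+//   c_j *= (cos(j * phi), sin(j * phi))         // twiddle rotation for j > 0
+//   (c_0, ..., c_4) = DFT_5(c_0, ..., c_4)      // 5-point butterfly
+//   out[n + j * Nx] = c_j                       // strided scatter to the same offsets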
+ * @param[in] exp_const Exponent constant
+ */
+__kernel void fft_radix_5_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-5
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load five complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4 * Nx, 0, 0));
+
+    // Compute phi
+    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+
+    // Compute DFT N = 5
+    DFT_5(c0, c1, c2, c3, c4);
+
+    // Store five complex output values
+    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
+    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+    vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+    vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-5 FFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in] Nx The butterfly span, i.e. the product of the radix orders of all previous FFT stages
+ * @param[in] Ni Nx * Ny, where Ny is the radix order of this stage
+ * @param[in] exp_const Exponent constant
+ */
+__kernel void fft_radix_5_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-5
+    uint kx = get_global_id(1);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load five complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3 * Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4 * Nx, 0));
+
+    // Compute phi
+    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+
+    // Compute DFT N = 5
+    DFT_5(c0, c1, c2, c3, c4);
+
+    // Store five complex output values
+    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
+    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+    vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3 * Nx, 0));
+    vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4 * Nx, 0));
+}
+
+/** Computes a stage of a radix-7 FFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in] Nx The butterfly span, i.e. the product of the radix orders of all previous FFT stages
+ * @param[in] Ni Nx * Ny, where Ny is the radix order of this stage
+ * @param[in] exp_const Exponent constant
+ */
+__kernel void fft_radix_7_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-7
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load seven complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4 * Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 5 * Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 6 * Nx, 0, 0));
+
+    // Compute phi
+    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+    TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
+    TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
+
+    // Compute DFT N = 7
+    DFT_7(c0, c1, c2, c3, c4, c5, c6);
+
+    // Store seven complex output values
+    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
+    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+    vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+    vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4 * Nx, 0, 0));
+    vstore2(c5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 5 * Nx, 0, 0));
+    vstore2(c6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 6 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-7 FFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in] Nx The butterfly span, i.e. the product of the radix orders of all previous FFT stages
+ * @param[in] Ni Nx * Ny, where Ny is the radix order of this stage
+ * @param[in] exp_const Exponent constant
+ */
+__kernel void fft_radix_7_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-7
+    uint kx = get_global_id(1);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load seven complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3 * Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4 * Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5 * Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6 * Nx, 0));
+
+    // Compute phi
+    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+    TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
+    TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
+
+    // Compute DFT N = 7
+    DFT_7(c0, c1, c2, c3, c4, c5, c6);
+
+    // Store seven complex output values
+    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
+    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+    vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3 * Nx, 0));
+    vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4 * Nx, 0));
+    vstore2(c5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 5 * Nx, 0));
+    vstore2(c6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 6 * Nx, 0));
+}
+
+/** Computes a stage of a radix-8 FFT on axis 0.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in] Nx The butterfly span, i.e. the product of the radix orders of all previous FFT stages
+ * @param[in] Ni Nx * Ny, where Ny is the radix order of this stage
+ * @param[in] exp_const Exponent constant
+ */
+__kernel void fft_radix_8_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-8
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load eight complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4 * Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 5 * Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 6 * Nx, 0, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c7 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 7 * Nx, 0, 0));
+
+    // Compute phi
+    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
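+    // Twiddle rotation, for reference: TWIDDLE_FACTOR_MULTIPLICATION(k * phi, ck)
+    // multiplies the complex value ck by w^k = (cos(k * phi), sin(k * phi)), i.e.
+    // rotates it by k * phi radians before the butterfly. Assuming the host passes
+    // exp_const = +/-2 * pi / Ni (sign selecting the forward/inverse transform, a
+    // convention taken from typical radix-stage launch code, not shown in this
+    // patch), this applies the inter-stage factor exp(+/-2 * pi * i * k * nx / Ni).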
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+    TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
+    TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
+    TWIDDLE_FACTOR_MULTIPLICATION(7 * phi, c7);
+
+    // Compute DFT N = 8
+    DFT_8(c0, c1, c2, c3, c4, c5, c6, c7);
+
+    // Store eight complex output values
+    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0));
+    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+    vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+    vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4 * Nx, 0, 0));
+    vstore2(c5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 5 * Nx, 0, 0));
+    vstore2(c6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 6 * Nx, 0, 0));
+    vstore2(c7, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 7 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-8 FFT on axis 1.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z (Optional) Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image
+ * @param[in] Nx The butterfly span, i.e. the product of the radix orders of all previous FFT stages
+ * @param[in] Ni Nx * Ny, where Ny is the radix order of this stage
+ * @param[in] exp_const Exponent constant
+ */
+__kernel void fft_radix_8_axis_1(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-8
+    uint kx = get_global_id(1);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load eight complex input values
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c0 = vload2(0, (__global DATA_TYPE *)input.ptr);
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3 * Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4 * Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5 * Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6 * Nx, 0));
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    c7 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 7 * Nx, 0));
+
+    // Compute phi
+    DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+    TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
+    TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
+    TWIDDLE_FACTOR_MULTIPLICATION(7 * phi, c7);
+
+    // Compute DFT N = 8
+    DFT_8(c0, c1, c2, c3, c4, c5, c6, c7);
+
+    // Store eight complex output values
+    vstore2(c0, 0, (__global DATA_TYPE *)output.ptr);
+    vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0));
+    vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0));
+    vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3 * Nx, 0));
+    vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4 * Nx, 0));
+    vstore2(c5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 5 * Nx, 0));
+    vstore2(c6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 6 * Nx, 0));
+    vstore2(c7, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 7 * Nx, 0));
+}
+#endif // defined(DATA_TYPE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/fft_digit_reverse.cl b/src/core/CL/cl_kernels/common/fft_digit_reverse.cl
new file mode 100644
index 0000000000..5f64d95bf9
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/fft_digit_reverse.cl
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE)
+/** Computes the digit reverse stage on axis X
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] idx_ptr Pointer to the index tensor. Supported data types: U32
+ * @param[in] idx_stride_x Stride of the index tensor in X dimension (in bytes)
+ * @param[in] idx_step_x idx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] idx_offset_first_element_in_bytes The offset of the first element in the index tensor
+ */
+__kernel void fft_digit_reverse_axis_0(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    VECTOR_DECLARATION(idx))
+{
+    // Get tensor pointers
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+    Vector idx = CONVERT_TO_VECTOR_STRUCT(idx);
+
+    const unsigned int iidx = *((__global uint *)(idx.ptr));
+
+    // Load data
+#if VEC_SIZE == 1
+    DATA_TYPE data = *((__global DATA_TYPE *)tensor3D_offset(&src, iidx, get_global_id(1), get_global_id(2)));
+#elif VEC_SIZE == 2
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&src, iidx, get_global_id(1), get_global_id(2)));
+#else // VEC_SIZE == 1
+#error "Only vec_size 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+
+    // Create result
+#if VEC_SIZE == 1
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    res = { data, 0 };
+#elif VEC_SIZE == 2
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    res = data;
+#else // VEC_SIZE == 1
+#error "Only vec_size 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+
+    // Store result
+#if defined(CONJ)
+    vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(res.s0, -res.s1), 0, (__global DATA_TYPE *)dst.ptr);
+#else // defined(CONJ)
+    vstore2(res, 0, (__global DATA_TYPE *)dst.ptr);
+#endif // defined(CONJ)
+}
+
+/** Computes the digit reverse stage on axis Y
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] idx_ptr Pointer to the index tensor. Supported data types: U32
+ * @param[in] idx_stride_x Stride of the index tensor in X dimension (in bytes)
+ * @param[in] idx_step_x idx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] idx_offset_first_element_in_bytes The offset of the first element in the index tensor
+ */
+__kernel void fft_digit_reverse_axis_1(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    VECTOR_DECLARATION(idx))
+{
+    // Get tensor pointers
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+    Vector idx = CONVERT_TO_VECTOR_STRUCT_NO_STEP(idx);
+
+    const unsigned int iidx = *((__global uint *)vector_offset(&idx, (int)(get_global_id(1))));
+
+    // Load data
+#if VEC_SIZE == 1
+    DATA_TYPE data = *((__global DATA_TYPE *)tensor3D_offset(&src, get_global_id(0), iidx, get_global_id(2)));
+#elif VEC_SIZE == 2
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&src, get_global_id(0), iidx, get_global_id(2)));
+#else // VEC_SIZE == 1
+#error "Only vec_size 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+
+    // Create result
+#if VEC_SIZE == 1
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    res = { data, 0 };
+#elif VEC_SIZE == 2
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    res = data;
+#else // VEC_SIZE == 1
+#error "Only vec_size 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+
+    // Store result
+#if defined(CONJ)
+    vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(res.s0, -res.s1), 0, (__global DATA_TYPE *)dst.ptr);
+#else // defined(CONJ)
+    vstore2(res, 0, (__global DATA_TYPE *)dst.ptr);
+#endif // defined(CONJ)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/fft_scale.cl b/src/core/CL/cl_kernels/common/fft_scale.cl
new file mode 100644
index 0000000000..c799dd3b9e
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/fft_scale.cl
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE)
+/** Computes the FFT scale stage
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x (Optional) dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y (Optional) dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z (Optional) dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor
+ * @param[in] scale Scale to apply to the complex value
+ */
+__kernel void fft_scale_conj(
+    TENSOR3D_DECLARATION(src)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(dst)
+#endif /* not IN_PLACE */
+    ,
+    float scale)
+{
+    // Get tensor pointers
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+#if defined(IN_PLACE)
+    Tensor3D dst = src;
+#else /* IN_PLACE */
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+#endif /* IN_PLACE */
+
+    // Store result
+#if VEC_SIZE == 1
+    *((__global DATA_TYPE *)dst.ptr) = (*(__global DATA_TYPE *)src.ptr) / (DATA_TYPE)scale;
+#elif VEC_SIZE == 2
+    // Load data
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    data = vload2(0, (__global DATA_TYPE *)src.ptr);
+    data /= (DATA_TYPE)scale;
+#if defined(CONJ)
+    vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(data.s0, -data.s1), 0, (__global DATA_TYPE *)dst.ptr);
+#else // defined(CONJ)
+    vstore2(data, 0, (__global DATA_TYPE *)dst.ptr);
+#endif // defined(CONJ)
+#else // VEC_SIZE == 1
+#error "Only vec_size 1 and 2 are supported"
+#endif // VEC_SIZE == 1
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/fill_border.cl b/src/core/CL/cl_kernels/common/fill_border.cl
new file mode 100644
index 0000000000..a43343c9f4
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/fill_border.cl
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Fill N pixels of the padding edge of a single channel image by replicating the closest valid pixel.
+ *
+ * @attention The DATA_TYPE needs to be passed at compile time.
+ * e.g. -DDATA_TYPE=int
+ *
+ * @attention The border size for top, bottom, left, right needs to be passed at compile time.
+ * e.g. -DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2
+ *
+ * @param[in,out] buf_ptr Pointer to the source image. Supported data types: All
+ * @param[in] buf_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] buf_stride_z Stride between images if batching images (in bytes)
+ * @param[in] buf_step_z buf_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] width Width of the valid region of the image
+ * @param[in] height Height of the valid region of the image
+ * @param[in] start_pos XY coordinate indicating the start point of the valid region
+ */
+__kernel void fill_image_borders_replicate(
+    TENSOR3D_DECLARATION(buf),
+    uint width,
+    uint height,
+    int2 start_pos)
+{
+    Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(buf);
+
+    // Update pointer to point to the starting point of the valid region
+    buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x;
+
+    const int total_width = BORDER_SIZE_LEFT + width + BORDER_SIZE_RIGHT;
+    const int gid0 = get_global_id(0);
+    const int gidH = gid0 - total_width;
+    const int gidW = gid0 - BORDER_SIZE_LEFT;
+
+    if(gidH >= 0)
+    {
+        // Handle left border
+        DATA_TYPE left_val = *(__global DATA_TYPE *)offset(&buf, 0, gidH);
+        for(int i = -BORDER_SIZE_LEFT; i < 0; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, i, gidH) = left_val;
+        }
+        // Handle right border
+        DATA_TYPE right_val = *(__global DATA_TYPE *)offset(&buf, width - 1, gidH);
+        for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, width + i, gidH) = right_val;
+        }
+    }
+    else
+    {
+        // Get value for corners
+        int val_idx = gidW;
+        if(gidW < 0 || gidW > (width - 1))
+        {
+            val_idx = gidW < 0 ? 0 : width - 1;
+        }
+
+        // Handle top border
+        DATA_TYPE top_val = *(__global DATA_TYPE *)offset(&buf, val_idx, 0);
+        for(int i = -BORDER_SIZE_TOP; i < 0; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, gidW, i) = top_val;
+        }
+        // Handle bottom border
+        DATA_TYPE bottom_val = *(__global DATA_TYPE *)offset(&buf, val_idx, height - 1);
+        for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, gidW, height + i) = bottom_val;
+        }
+    }
+}
+
+/** Fill N pixels of the padding edge of a single channel image with a constant value.
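+ *
+ * The work-item mapping is the same as in fill_image_borders_replicate above: a
+ * single 1D range covers both border directions. A work-item with
+ * get_global_id(0) >= BORDER_SIZE_LEFT + width + BORDER_SIZE_RIGHT fills the left
+ * and right borders of one row, while every other work-item fills the top and
+ * bottom borders of one column.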
+ *
+ * @attention The DATA_TYPE needs to be passed at compile time.
+ * e.g. -DDATA_TYPE=int
+ *
+ * @attention The border size for top, bottom, left, right needs to be passed at compile time.
+ * e.g. -DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2
+ *
+ * @param[out] buf_ptr Pointer to the source image. Supported data types: All
+ * @param[in] buf_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] buf_stride_z Stride between images if batching images (in bytes)
+ * @param[in] buf_step_z buf_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] width Width of the valid region of the image
+ * @param[in] height Height of the valid region of the image
+ * @param[in] start_pos XY coordinate indicating the start point of the valid region
+ * @param[in] constant_value Constant value to use to fill the edges
+ */
+__kernel void fill_image_borders_constant(
+    TENSOR3D_DECLARATION(buf),
+    uint width,
+    uint height,
+    int2 start_pos,
+    DATA_TYPE constant_value)
+{
+    Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(buf);
+
+    // Update pointer to point to the starting point of the valid region
+    buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x;
+
+    const int total_width = BORDER_SIZE_LEFT + width + BORDER_SIZE_RIGHT;
+    const int gid0 = get_global_id(0);
+    const int gidH = gid0 - total_width;
+    const int gidW = gid0 - BORDER_SIZE_LEFT;
+
+    if(gidH >= 0)
+    {
+        // Handle left border
+        for(int i = -BORDER_SIZE_LEFT; i < 0; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, i, gidH) = constant_value;
+        }
+        // Handle right border
+        for(int i = 0; i < BORDER_SIZE_RIGHT; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, width + i, gidH) = constant_value;
+        }
+    }
+    else
+    {
+        // Handle top border
+        for(int i = -BORDER_SIZE_TOP; i < 0; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, gidW, i) = constant_value;
+        }
+        // Handle bottom border
+        for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i)
+        {
+            *(__global DATA_TYPE *)offset(&buf, gidW, height + i) = constant_value;
+        }
+    }
+}
diff --git a/src/core/CL/cl_kernels/common/floor.cl b/src/core/CL/cl_kernels/common/floor.cl
new file mode 100644
index 0000000000..f6dd4edd2e
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/floor.cl
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
+
+/** Perform a floor operation on an input tensor.
+ *
+ * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note The leftover size in the X dimension should be given as a preprocessor argument using -DVEC_SIZE_LEFTOVER, i.e. x_dimension % VEC_SIZE. e.g. -DVEC_SIZE_LEFTOVER=1
+ * @note Only floating-point data types are supported.
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination image in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void floor_layer(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    // Offset computation
+    const uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+
+    // Address computation
+    __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data0 = floor(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr));
+
+    STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/gather.cl b/src/core/CL/cl_kernels/common/gather.cl
new file mode 100644
index 0000000000..76eaefa92e
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/gather.cl
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(AXIS)
+
+/** Performs the Gather operation along the chosen axis
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1
+ * @attention Output tensor depth should be given as a preprocessor argument using -DOUTPUT_DIM_Z=size. e.g. -DOUTPUT_DIM_Z=16
+ * @attention Input tensor depth should be given as a preprocessor argument using -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16
+ *
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per work item (in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per work item (in bytes)
+ * @param[in] input_offset_first_element_in_bytes Offset of the first element in the source tensor
+ * @param[in] indices_ptr Pointer to the indices vector. Supported data types: S32/U32.
+ * @param[in] indices_stride_x Stride of the indices vector in X dimension (in bytes)
+ * @param[in] indices_step_x indices_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] indices_offset_first_element_in_bytes Offset of the first element in the indices vector
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per work item (in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per work item (in bytes)
+ * @param[in] output_offset_first_element_in_bytes Offset of the first element in the destination tensor
+ */
+__kernel void gather(
+    TENSOR4D_DECLARATION(input),
+    VECTOR_DECLARATION(indices),
+    TENSOR4D_DECLARATION(output))
+{
+    const int px = get_global_id(0);
+    const int py = get_global_id(1);
+    const int pz = get_global_id(2) % OUTPUT_DIM_Z;
+    const int pw = get_global_id(2) / OUTPUT_DIM_Z;
+
+    const Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, INPUT_DIM_Z);
+    const Vector indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(indices);
+    Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z);
+
+#if AXIS == 0
+    const uint index = *(__global const uint *)vector_offset(&indices, px);
+    __global const uchar *input_addr = tensor4D_offset(&input, index, py, pz, pw);
+#elif AXIS == 1
+    const uint index = *(__global const uint *)vector_offset(&indices, py);
+    __global const uchar *input_addr = tensor4D_offset(&input, px, index, pz, pw);
+#elif AXIS == 2
+    const uint index = *(__global const uint *)vector_offset(&indices, pz);
+    __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, pw);
+#elif AXIS == 3
+    const uint index = *(__global const uint *)vector_offset(&indices, pw);
+    __global const uchar *input_addr = tensor4D_offset(&input, px, py, pz, index);
+#endif //AXIS
+
+    *(__global DATA_TYPE *)output.ptr = *((__global const DATA_TYPE *)input_addr);
+}
+
+#endif //defined(DATA_TYPE) && defined(AXIS)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/gemm.cl b/src/core/CL/cl_kernels/common/gemm.cl
new file mode 100644
index 0000000000..10435d376f
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/gemm.cl
@@ -0,0 +1,4386 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "gemm_helpers.h"
+#include "repeat.h"
+
+#if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0)
+#define INC2 (VEC_DATA_TYPE(uint, 2))(0, 1)
+#define INC3 (VEC_DATA_TYPE(uint, 3))(0, 1, 2)
+#define INC4 (VEC_DATA_TYPE(uint, 4))(0, 1, 2, 3)
+#define INC8 (VEC_DATA_TYPE(uint, 8))(0, 1, 2, 3, 4, 5, 6, 7)
+#define INC16 (VEC_DATA_TYPE(uint, 16))(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+#define CONCAT_INC(K0) INC##K0
+#define INC(K0) CONCAT_INC(K0)
+
+#if(SRC_WIDTH % K0)
+#define BOUNDARY_CONDITION_X(x, a) \
+    ({ \
+        a = select(0, a, CONVERT(((x * (VEC_DATA_TYPE(uint, K0))K0 + INC(K0)) < (VEC_DATA_TYPE(uint, K0))SRC_WIDTH), VEC_DATA_TYPE(DATA_TYPE, K0))); \
+    })
+#else // (SRC_WIDTH % K0)
+#define BOUNDARY_CONDITION_X(x, a) \
+    ({})
+#endif // (SRC_WIDTH % K0)
+
+#define LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
+    ({ \
+        if(y * M0 + M0 >= SRC_HEIGHT && PARTIAL_LOAD_M0 != 0) \
+        { \
+            if(x * K0 + K0 >= SRC_WIDTH && (PARTIAL_LOAD_K0 != 0)) \
+            { \
+                LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
+            } \
+            else \
+            { \
+                LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
+            } \
+        } \
+        else \
+        { \
+            if(x * K0 + K0 >= SRC_WIDTH && (PARTIAL_LOAD_K0 != 0)) \
+            { \
+                LOAD_TENSOR_M0XN0(M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
+            } \
+            else \
+            { \
+                LOAD_TENSOR_M0XN0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \
+            } \
+        } \
+    })
+
+/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (not transposed) in
+ * the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
+ * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
+ * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)
+ * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)
+ * @note Only the following values for M0, K0 and V0 are supported:
+ *                 M0: 2,3,4,5,6,7,8
+ *                 K0: 2,3,4,8,16
+ *                 V0: greater than 0
+ * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
+ *
+ * @param[in]  src_ptr                           Pointer to the source LHS tensor. Supported data types: All
+ * @param[in]  src_stride_x                      Stride of the source LHS tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  src_stride_y                      Stride of the source LHS tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  src_stride_z                      Stride of the source LHS tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in]  cross_plane_pad                   (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ */
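+/* Example (illustrative only, not part of the kernel contract): a 64x32 FP32 LHS
+ * reshaped with 4x4 blocks and two blocks per output row could be compiled with
+ * options along the lines of
+ *   -DDATA_TYPE=float -DSRC_WIDTH=32 -DSRC_HEIGHT=64 -DM0=4 -DK0=4 -DV0=2
+ *   -DPARTIAL_LOAD_M0=0 -DPARTIAL_LOAD_K0=0
+ * where the partial-load sizes are assumed to be SRC_HEIGHT % M0 and SRC_WIDTH % K0
+ * (zero here, so the boundary-aware load always takes the full-block path).
+ */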
+__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src),
+                                         TENSOR3D_DECLARATION(dst)
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                         ,
+                                         uint cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+                                        )
+{
+    // Block size
+#define BLOCK_SIZE ((M0) * (K0))
+
+    // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (K0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+    // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (K0) * (V0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (K0)
+#endif // defined(INTERLEAVE)
+
+    // Compute source and destination addresses
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+    // ------------------ Compute input/output addresses ---------------------------
+
+    // Compute the input address
+    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;
+
+    // Compute the output address
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));
+
+    // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply src_stride_z by DEPTH_GEMM3D
+
+    input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
+
+    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    input_ptr += z * (uint)src_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    output_ptr += z * (uint)dst_stride_z;
+
+    // ---------------------------Load input values --------------------------------
+    // Load values from the LHS matrix
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);
+
+    LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);
+
+    // ---------------------------Store output values ------------------------------
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
+    STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+
+#if M0 == 2
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                  \
+    ({                                                                                            \
+        VEC_DATA_TYPE(DATA_TYPE, M0)                                                              \
+        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i);                                   \
+        VSTORE(M0)                                                                                \
+        (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+    })
+#elif M0 == 3 // M0 == 3
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                  \
+    ({                                                                                            \
+        VEC_DATA_TYPE(DATA_TYPE, M0)                                                              \
+        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i);                          \
+        VSTORE(M0)                                                                                \
+        (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+    })
+#elif M0 == 4 // M0 == 4
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                  \
+    ({                                                                                            \
+        VEC_DATA_TYPE(DATA_TYPE, M0)                                                              \
+        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);                 \
+        VSTORE(M0)                                                                                \
+        (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+    })
+#elif M0 == 5 // M0 == 5
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                  \
+    ({                                                                                            \
+        VEC_DATA_TYPE(DATA_TYPE, 4)                                                               \
+        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);                 \
+        DATA_TYPE res1 = a4.s##i;                                                                     \
+        VSTORE(4)                                                                                     \
+        (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)));    \
+        *((__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4) = res1; \
+    })
+#elif M0 == 6 // M0 == 6
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                      \
+    ({                                                                                                \
+        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                   \
+        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);                     \
+        VEC_DATA_TYPE(DATA_TYPE, 2)                                                                   \
+        res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i);                                       \
+        VSTORE(4)                                                                                     \
+        (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)));    \
+        VSTORE(2)                                                                                     \
+        (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \
+    })
+#elif M0 == 7 // M0 == 7
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                      \
+    ({                                                                                                \
+        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                   \
+        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);                     \
+        VEC_DATA_TYPE(DATA_TYPE, 3)                                                                   \
+        res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i);                              \
+        VSTORE(4)                                                                                     \
+        (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)));    \
+        VSTORE(3)                                                                                     \
+        (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \
+    })
+#elif M0 == 8 // M0 == 8
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                      \
+    ({                                                                                                \
+        VEC_DATA_TYPE(DATA_TYPE, M0)                                                                  \
+        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, a6.s##i, a7.s##i); \
+        VSTORE(M0)                                                                                    \
+        (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)));     \
+    })
+#else // M0 not supported
+#error "M0 value not supported"
+#endif // M0 conditions
+
+/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (transposed) in
+ * the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
+ * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).
+ * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)
+ * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)
+ * @note Only the following values for M0, K0 and V0 are supported:
+ *                 M0: 2,3,4,5,6,7,8
+ *                 K0: 2,3,4,8,16
+ *                 V0: greater than 0
+ * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
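+ * @note Illustrative example (not normative): with M0=2, K0=2 and V0=1, a 2x2 input block
+ *           | a00 a01 |
+ *           | a10 a11 |
+ *       is stored transposed on one output row as: a00 a10 a01 a11
+ *       (column 0 first, then column 1, each column being M0 contiguous elements).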
+ * + * @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: All + * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor + * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D) + */ +__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst) +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D + ) +{ + // Block size +#define BLOCK_SIZE ((M0) * (K0)) + + // Output offset X +#if defined(INTERLEAVE) +#define OUTPUT_OFFSET_X (M0) +#else // defined(INTERLEAVE) +#define OUTPUT_OFFSET_X (BLOCK_SIZE) +#endif // defined(INTERLEAVE) + + // Output step X +#if defined(INTERLEAVE) +#define OUTPUT_STEP_X (M0) * (V0) +#else // Do not interleave +#define OUTPUT_STEP_X (M0) +#endif // defined(INTERLEAVE) + + // Compute source and destination addresses + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + + // ------------------ Compute input/output addresses --------------------------- + + // Compute the input address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y; + + // Compute the output address + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) * + (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)); + + // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0; + REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0); + +#if defined(REINTERPRET_INPUT_AS_3D) + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we
+    // multiply src_stride_z by DEPTH_GEMM3D
+
+    input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
+
+    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+    input_ptr += z * (uint)src_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    output_ptr += z * (uint)dst_stride_z;
+
+    // ---------------------------Load input values --------------------------------
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);
+
+    LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);
+
+    // ---------------------------Transpose and store block -----------------------
+
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1);
+#if K0 > 2
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2);
+#endif // K0 > 2
+#if K0 > 3
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3);
+#endif // K0 > 3
+#if K0 > 4
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7);
+#endif // K0 > 4
+#if K0 > 8
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E);
+    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F);
+#endif // K0 > 8
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0)
+
+#if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
+/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (not transposed) in
+ * the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
+ * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
+ * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
+ * @note Only the following values for K0, N0 and H0 are supported:
+ *                 N0: 2,3,4,8,16
+ *                 K0: 1,2,3,4,8,16
+ *                 H0: greater than 0
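+ * @note Illustrative example (not normative): with K0=2, N0=2 and H0=1, a 2x2 input block
+ *           | b00 b01 |
+ *           | b10 b11 |
+ *       is stored (not transposed) on one output row as: b00 b01 b10 b11
+ *       (row 0 first, then row 1, each row being N0 contiguous elements).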
+ *
+ * @param[in]  src_ptr                           Pointer to the source RHS tensor. Supported data types: All
+ * @param[in]  src_stride_x                      Stride of the source RHS tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  src_stride_y                      Stride of the source RHS tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  src_stride_z                      Stride of the source RHS tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem (in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src),
+                                         TENSOR3D_DECLARATION(dst))
+{
+    // Block size
+#define BLOCK_SIZE ((K0) * (N0))
+
+    // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (N0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+    // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (N0) * (H0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (N0)
+#endif // defined(INTERLEAVE)
+
+    // Compute source and destination addresses
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+    // ------------------ Compute input/output addresses ---------------------------
+
+    // Compute the input address
+    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;
+
+    // Compute the output address
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % (uint)H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((x / (uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z;
+
+    // ---------------------------Load input values --------------------------------
+
+    REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, a2=0...a(K0-1)=0;
+
+    // Load values from the RHS matrix
+    a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+#if K0 > 1
+    if(y * (uint)K0 + 1 < SRC_HEIGHT)
+    {
+        a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+    }
+#endif // K0 > 1
+#if K0 > 2
+    if(y * (uint)K0 + 2 < SRC_HEIGHT)
+    {
+        a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
+    }
+#endif // K0 > 2
+#if K0 > 3
+    if(y * (uint)K0 + 3 < SRC_HEIGHT)
+    {
+        a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
+    }
+#endif // K0 > 3
+#if K0 > 4
+    if(y * (uint)K0 + 4 < SRC_HEIGHT)
+    {
+        a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
+    }
+    if(y * (uint)K0 + 5 < SRC_HEIGHT)
+    {
+        a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
+    }
+    if(y * (uint)K0 + 6 < SRC_HEIGHT)
+    {
+        a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
+    }
+    if(y * (uint)K0 + 7 < SRC_HEIGHT)
+    {
+        a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
+    }
+#endif // K0 > 4
+#if K0 > 8
+    if(y * (uint)K0 + 8 < SRC_HEIGHT)
+    {
+        a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
+    }
+    if(y * (uint)K0 + 9 < SRC_HEIGHT)
+    {
+        a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
+    }
+    if(y * (uint)K0 + 10 < SRC_HEIGHT)
+    {
+        aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
+    }
+    if(y * (uint)K0 + 11 < SRC_HEIGHT)
+    {
+        aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
+    }
+    if(y * (uint)K0 + 12 < SRC_HEIGHT)
+    {
+        aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
+    }
+    if(y * (uint)K0 + 13 < SRC_HEIGHT)
+    {
+        aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
+    }
+    if(y * (uint)K0 + 14 < SRC_HEIGHT)
+    {
+        aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
+    }
+    if(y * (uint)K0 + 15 < SRC_HEIGHT)
+    {
+        aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
+    }
+#endif // K0 > 8
+
+    // ---------------------------Store output values ------------------------------
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
+    STORE_BLOCK(K0, N0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+
+#if defined(TRANSPOSE)
+/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (transposed) in
+ * the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)
+ * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).
+ * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.
+ * @note The option -DTRANSPOSE must be passed at compile time.
+ * @note Only the following values for K0, N0 and H0 are supported:
+ *                 N0: 2,3,4,8,16
+ *                 K0: 2,3,4,8,16
+ *                 H0: greater than 0
+ *
+ * @param[in]  src_ptr                           Pointer to the source RHS tensor.
Supported data types: All + * @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor + * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + */ +__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + // Block size +#define BLOCK_SIZE ((K0) * (N0)) + + // Output offset X +#if defined(INTERLEAVE) +#define OUTPUT_OFFSET_X (K0) +#else // defined(INTERLEAVE) +#define OUTPUT_OFFSET_X (BLOCK_SIZE) +#endif // defined(INTERLEAVE) + + // Output step X +#if defined(INTERLEAVE) +#define OUTPUT_STEP_X (K0) * (H0) +#else // Do not interleave +#define OUTPUT_STEP_X (K0) +#endif // defined(INTERLEAVE) + + // Compute source and destination addresses + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + + // ------------------ Compute input/output addresses --------------------------- + + // Compute the input address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z; + + // Compute the output address + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((x / + (uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z; + + // ---------------------------Load input values -------------------------------- + REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... 
a(K0-1)=0; + + // Load values from the RHS matrix + a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y)); + if(y * (uint)K0 + 1 < SRC_HEIGHT) + { + a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y)); + } +#if K0 > 2 + if(y * (uint)K0 + 2 < SRC_HEIGHT) + { + a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y)); + } +#endif // K0 > 2 +#if K0 > 3 + if(y * (uint)K0 + 3 < SRC_HEIGHT) + { + a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y)); + } +#endif // K0 > 3 +#if K0 > 4 + if(y * (uint)K0 + 4 < SRC_HEIGHT) + { + a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y)); + } + if(y * (uint)K0 + 5 < SRC_HEIGHT) + { + a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y)); + } + if(y * (uint)K0 + 6 < SRC_HEIGHT) + { + a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y)); + } + if(y * (uint)K0 + 7 < SRC_HEIGHT) + { + a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y)); + } +#endif // K0 > 4 +#if K0 > 8 + if(y * (uint)K0 + 8 < SRC_HEIGHT) + { + a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y)); + } + if(y * (uint)K0 + 9 < SRC_HEIGHT) + { + a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y)); + } + if(y * (uint)K0 + 10 < SRC_HEIGHT) + { + aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y)); + } + if(y * (uint)K0 + 11 < SRC_HEIGHT) + { + aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y)); + } + if(y * (uint)K0 + 12 < SRC_HEIGHT) + { + aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y)); + } + if(y * (uint)K0 + 13 < SRC_HEIGHT) + { + aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y)); + } + if(y * (uint)K0 + 14 < SRC_HEIGHT) + { + aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y)); + } + if(y * (uint)K0 + 15 < SRC_HEIGHT) + { + aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y)); + } +#endif // K0 > 8 + + // ---------------------------Transpose the block ------------------------------ + REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), res, 0); //VEC_DATA_TYPE(DATA_TYPE, K0) res0=0, res1=0, res2=0,... 
res(N0-1)=0;
+
+#if K0 == 2
+    // This part computes the following transpositions:
+    // 2x2  -> 2x2
+    // 2x4  -> 4x2
+    // 2x8  -> 8x2
+    // 2x16 -> 16x2
+    res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0);
+    res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1);
+#if N0 > 2
+    res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2);
+#endif // N0 > 2
+#if N0 > 3
+    res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3);
+#endif // N0 > 3
+#if N0 > 4
+    res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4);
+    res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5);
+    res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6);
+    res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7);
+#endif // N0 > 4
+#if N0 > 8
+    res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8);
+    res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9);
+    resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA);
+    resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB);
+    resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC);
+    resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD);
+    resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE);
+    resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF);
+#endif // N0 > 8
+
+#elif K0 == 3 // K0 == 3
+    // This part computes the following transpositions:
+    // 3x2  -> 2x3
+    // 3x4  -> 4x3
+    // 3x8  -> 8x3
+    // 3x16 -> 16x3
+    res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0);
+    res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1);
+#if N0 > 2
+    res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2);
+#endif // N0 > 2
+#if N0 > 3
+    res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3);
+#endif // N0 > 3
+#if N0 > 4
+    res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4);
+    res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5);
+    res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6);
+    res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7);
+#endif // N0 > 4
+#if N0 > 8
+    res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8);
+    res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9);
+    resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA);
+    resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB);
+    resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC);
+    resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD);
+    resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE);
+    resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF);
+#endif // N0 > 8
+
+#elif K0 == 4 // K0 == 4
+    // This part computes the following transpositions:
+    // 4x2  -> 2x4
+    // 4x4  -> 4x4
+    // 4x8  -> 8x4
+    // 4x16 -> 16x4
+    res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0);
+    res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1);
+#if N0 > 2
+    res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2);
+#endif // N0 > 2
+#if N0 > 3
+    res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3);
+#endif // N0 > 3
+#if N0 > 4
+    res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4);
+    res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5);
+    res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6);
+    res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7);
+#endif // N0 > 4
+#if N0 > 8
+    res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8);
+    res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9);
+    resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA);
+    resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB);
+    resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC);
+ resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD); + resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE); + resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF); +#endif // N0 > 8 + +#elif K0 == 8 // K0 == 8 + // This part computes the following transpositions: + // 8x2 -> 2x8 + // 8x4 -> 4x8 + // 8x8 -> 8x8 + // 8x16 -> 16x8 + res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0); + res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1); +#if N0 > 2 + res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2); +#endif // N0 > 2 +#if N0 > 3 + res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3); +#endif // N0 > 3 +#if N0 > 4 + res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4); + res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5); + res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6); + res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7); +#endif // N0 > 4 +#if N0 > 8 + res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8); + res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9); + resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA); + resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB); + resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC); + resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD); + resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE); + resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF); +#endif // N0 > 8 + +#elif K0 == 16 // K0 == 16 + + // This part computes the following transpositions: + // 16x2 -> 2x16 + // 16x4 -> 4x16 + // 16x8 -> 8x16 + // 16x16 -> 16x16 + res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0, + a8.s0, a9.s0, aA.s0, aB.s0, aC.s0, aD.s0, aE.s0, aF.s0); + res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1, + a8.s1, a9.s1, aA.s1, aB.s1, aC.s1, aD.s1, aE.s1, aF.s1); +#if N0 > 2 + res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2, + a8.s2, a9.s2, aA.s2, aB.s2, aC.s2, aD.s2, aE.s2, aF.s2); +#endif // N0 > 2 +#if N0 > 3 + res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3, + a8.s3, a9.s3, aA.s3, aB.s3, aC.s3, aD.s3, aE.s3, aF.s3); +#endif // N0 > 3 +#if N0 > 4 + res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4, + a8.s4, a9.s4, aA.s4, aB.s4, aC.s4, aD.s4, aE.s4, aF.s4); + res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5, + a8.s5, a9.s5, aA.s5, aB.s5, aC.s5, aD.s5, aE.s5, aF.s5); + res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6, + a8.s6, a9.s6, aA.s6, aB.s6, aC.s6, aD.s6, aE.s6, aF.s6); + res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7, + a8.s7, a9.s7, aA.s7, aB.s7, aC.s7, aD.s7, aE.s7, aF.s7); +#endif // N0 > 4 +#if N0 > 8 + res8 
= (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8,
+                                          a8.s8, a9.s8, aA.s8, aB.s8, aC.s8, aD.s8, aE.s8, aF.s8);
+    res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9,
+                                          a8.s9, a9.s9, aA.s9, aB.s9, aC.s9, aD.s9, aE.s9, aF.s9);
+    resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA,
+                                          a8.sA, a9.sA, aA.sA, aB.sA, aC.sA, aD.sA, aE.sA, aF.sA);
+    resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB,
+                                          a8.sB, a9.sB, aA.sB, aB.sB, aC.sB, aD.sB, aE.sB, aF.sB);
+    resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC,
+                                          a8.sC, a9.sC, aA.sC, aB.sC, aC.sC, aD.sC, aE.sC, aF.sC);
+    resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD,
+                                          a8.sD, a9.sD, aA.sD, aB.sD, aC.sD, aD.sD, aE.sD, aF.sD);
+    resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE,
+                                          a8.sE, a9.sE, aA.sE, aB.sE, aC.sE, aD.sE, aE.sE, aF.sE);
+    resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF,
+                                          a8.sF, a9.sF, aA.sF, aB.sF, aC.sF, aD.sF, aE.sF, aF.sF);
+#endif // N0 > 8
+
+#else // K0 not supported
+#error "Not supported K0 value"
+#endif // K0 conditions
+
+    // ---------------------------Store the output values ------------------------------
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
+    STORE_BLOCK(N0, K0, DATA_TYPE, res, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
+#endif // defined(TRANSPOSE)
+#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
+
+#define CONCAT(a, b) a##b
+
+#define ARM_DOT1(a, b, c)         \
+    ({                            \
+        c = fma(a, b, c);         \
+    })
+#define ARM_DOT2(a, b, c)         \
+    ({                            \
+        c = fma(a.s0, b.s0, c);   \
+        c = fma(a.s1, b.s1, c);   \
+    })
+#define ARM_DOT3(a, b, c)           \
+    ({                              \
+        ARM_DOT2(a, b, c);          \
+        c = fma((a.s2), (b.s2), c); \
+    })
+#define ARM_DOT4(a, b, c)           \
+    ({                              \
+        ARM_DOT3(a, b, c);          \
+        c = fma((a.s3), (b.s3), c); \
+    })
+#define ARM_DOT8(a, b, c)            \
+    ({                               \
+        ARM_DOT4((a.lo), (b.lo), c); \
+        ARM_DOT4((a.hi), (b.hi), c); \
+    })
+#define ARM_DOT16(a, b, c)           \
+    ({                               \
+        ARM_DOT8((a.lo), (b.lo), c); \
+        ARM_DOT8((a.hi), (b.hi), c); \
+    })
+
+#if N0 == 2
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+    ({                             \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##0), (c.s0));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##1), (c.s1));     \
+    })
+#elif N0 == 3 // N0 == 3
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+    ({                             \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##0), (c.s0));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##1), (c.s1));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##2), (c.s2));     \
+    })
+#elif N0 == 4 // N0 == 4
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+    ({                             \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##0), (c.s0));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##1), (c.s1));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##2), (c.s2));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##3), (c.s3));     \
+    })
+#elif N0 == 8 // N0 == 8
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+    ({                             \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##0), (c.s0));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##1), (c.s1));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##2), (c.s2));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##3), (c.s3));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##4), (c.s4));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##5), (c.s5));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##6), (c.s6));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##7), (c.s7));     \
+    })
+#elif N0 == 16 // N0 == 16
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+    ({                             \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##0), (c.s0));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##1), (c.s1));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##2), (c.s2));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##3), (c.s3));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##4), (c.s4));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##5), (c.s5));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##6), (c.s6));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##7), (c.s7));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##8), (c.s8));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##9), (c.s9));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##A), (c.sA));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##B), (c.sB));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##C), (c.sC));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##D), (c.sD));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##E), (c.sE));     \
+        CONCAT(ARM_DOT, k0)        \
+        ((a), (b##F), (c.sF));     \
+    })
+#else // N0 not supported
+#error "N0 value not supported"
+#endif // N0 conditions
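+
+/* A minimal expansion sketch (illustrative, assuming N0=2 and K0=4):
+ * ARM_DOT_K0XN0(4, a, b, c) unrolls to
+ *     ARM_DOT4((a), (b0), (c.s0));
+ *     ARM_DOT4((a), (b1), (c.s1));
+ * and each ARM_DOT4 is a chain of fused multiply-adds, i.e.
+ *     c.s0 = fma(a.s3, b0.s3, fma(a.s2, b0.s2, fma(a.s1, b0.s1, fma(a.s0, b0.s0, c.s0))));
+ * so every accumulator lane holds the dot product of one LHS row fragment with one RHS column fragment.
+ */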
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ *  The LHS matrix is NOT reshaped
+ *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
+ *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)
+ * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ *  - N0 = 2, 3, 4, 8, 16
+ *  - K0 = 2, 3, 4, 8, 16
+ *  - H0 >= 1
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data type: F16/F32
+ * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
+ * @param[in]  rhs_ptr                            Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_x                       Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  rhs_step_x                         rhs_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  rhs_stride_y                       Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  rhs_step_y                         rhs_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the RHS reshaped matrix
+ * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),
+                                          IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+                                          IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+                                          IMAGE_DECLARATION(dst),
+                                          uint lhs_stride_z,
+                                          uint rhs_stride_z,
+#if defined(BETA)
+                                          uint bias_stride_z,
+#endif //defined(BETA)
+                                          uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                          ,
+                                          uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if
defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D + ) +{ + // Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (K0) +#define RHS_STEP_X ((K0) * (H0)) +#define RHS_STEP_LOOP (1) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (K0) +#define RHS_STEP_LOOP (H0) +#endif // defined(RHS_INTERLEAVE) + + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; + + // Compute RHS reshaped matrix address + uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_offset += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0; + REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); + +#if defined(REINTERPRET_INPUT_AS_3D) + // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply lhs_stride_z by DEPTH_GEMM3D + lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + lhs_offset += z * lhs_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... 
c(M0-1)=0; + + int i = 0; + for(; i <= (K - K0); i += K0) + { + // Supported cases (M0, K0): + // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 + // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 + // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 + // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 + // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 + // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 + // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 + // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); + + // Load values from RHS reshaped matrix + LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero); + + // Accumulate + ARM_DOT_K0XN0(K0, a0, b, c0); +#if M0 > 1 + ARM_DOT_K0XN0(K0, a1, b, c1); +#endif // M0 > 1 +#if M0 > 2 + ARM_DOT_K0XN0(K0, a2, b, c2); +#endif // M0 > 2 +#if M0 > 3 + ARM_DOT_K0XN0(K0, a3, b, c3); +#endif // M0 > 3 +#if M0 > 4 + ARM_DOT_K0XN0(K0, a4, b, c4); +#endif // M0 > 4 +#if M0 > 5 + ARM_DOT_K0XN0(K0, a5, b, c5); +#endif // M0 > 5 +#if M0 > 6 + ARM_DOT_K0XN0(K0, a6, b, c6); +#endif // M0 > 6 +#if M0 > 7 + ARM_DOT_K0XN0(K0, a7, b, c7); +#endif // M0 > 7 + + lhs_offset += K0 * sizeof(DATA_TYPE); + rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE); + } + + // Left-over accumulations + for(; i < K; ++i) + { + // Load values from LHS matrix + LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); + + // Load values from RHS reshaped matrix + LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero); + + // Accumulate + ARM_DOT_K0XN0(1, a0, b, c0); +#if M0 > 1 + ARM_DOT_K0XN0(1, a1, b, c1); +#endif // M0 > 1 +#if M0 > 2 + ARM_DOT_K0XN0(1, a2, b, c2); +#endif // M0 > 2 +#if M0 > 3 + ARM_DOT_K0XN0(1, a3, b, c3); +#endif // M0 > 3 +#if M0 > 4 + ARM_DOT_K0XN0(1, a4, b, c4); +#endif // M0 > 4 +#if M0 > 5 + ARM_DOT_K0XN0(1, a5, b, c5); +#endif // M0 > 5 +#if M0 > 6 + ARM_DOT_K0XN0(1, a6, b, c6); +#endif // M0 > 6 +#if M0 > 7 + ARM_DOT_K0XN0(1, a7, b, c7); +#endif // M0 > 7 + + lhs_offset += sizeof(DATA_TYPE); + rhs_offset += sizeof(DATA_TYPE); + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
+
+    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias
+    ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    const bool cond_y = y == 0;
+    const bool cond_x = ((x + 1) * N0 >= N);
+
+    // Store output block
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+
+#if defined(OPENCL_IMAGE_SUPPORT)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in an OpenCL image.
+ *  The LHS matrix is NOT reshaped
+ *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
+ *
+ * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)
+ * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT= (e.g. -DRHS_HEIGHT=32)
+ *       Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
+ *       could be different from the value returned by get_image_height(rhs_img).
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
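+ * @note Illustrative example (assumed collapse scheme, not normative): if three batches of a
+ *       64-row reshaped RHS are stacked vertically into one image, get_image_height(rhs_img)
+ *       returns 192 while -DRHS_HEIGHT=64 should be used, since RHS_HEIGHT is the per-batch
+ *       height used to index into the image (y_rhs = (x / H0) + z_rhs * RHS_HEIGHT below).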
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) + * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 + * - N0 = 4, 8, 16 + * - K0 = 4, 8, 16 + * - H0 >= 1 + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. + * The activation function is performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix + * + * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F32 + * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes) + * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes) + * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix + * @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr + * @param[in] bias_ptr (Optional) Pointer to the bias matrix. 
Supported data type: same as @p lhs_ptr + * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) + * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) + * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs), + __read_only image2d_t rhs_img, +#if defined(BETA) + IMAGE_DECLARATION(bias), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), + uint lhs_stride_z, + uint rhs_stride_z, +#if defined(BETA) + uint bias_stride_z, +#endif //defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint lhs_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D + ) +{ + // Pixel unit +#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0) + +#define LEFTOVER_K (K % K0) + + // Block size +#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (PIXEL_UNIT) +#define RHS_STEP_X (PIXEL_UNIT * (H0)) +#define RHS_STEP_LOOP (1) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X PIXEL_UNIT +#define RHS_STEP_LOOP (H0) +#endif // defined(RHS_INTERLEAVE) + + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH); +#else // defined(MATRIX_B_DEPTH) + const uint z_rhs = get_global_id(2); +#endif // defined(MATRIX_B_DEPTH) + + // Compute RHS matrix coordinates + uint x_rhs = (get_global_id(0) % 
H0) * (uint)RHS_OFFSET_X;
+ const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);
+
+ int i = 0;
+ for(; i <= (K - K0); i += K0)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS matrix stored in a cl_image
+ REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
+ LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
+
+ // Accumulate
+ ARM_DOT_K0XN0(K0, a0, b, c0);
+#if M0 > 1
+ ARM_DOT_K0XN0(K0, a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+ ARM_DOT_K0XN0(K0, a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+ ARM_DOT_K0XN0(K0, a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+ ARM_DOT_K0XN0(K0, a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+ ARM_DOT_K0XN0(K0, a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+ ARM_DOT_K0XN0(K0, a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+ ARM_DOT_K0XN0(K0, a7, b, c7);
+#endif // M0 > 7
+
+ lhs_offset += K0 * sizeof(DATA_TYPE);
+ x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
+ }
+
+#if LEFTOVER_K != 0
+ // Note: We cannot read out-of-bound elements from the RHS matrix because
+ // the RHS width is always a multiple of K0. This is not necessarily true for the LHS matrix
+
+ union UNION_VEC_TYPE
+ {
+ DATA_TYPE s[K0];
+ VEC_DATA_TYPE(DATA_TYPE, K0)
+ v;
+ };
+
+ union UNION_VEC_TYPE a0 = {.v = 0 };
+#if M0 > 1
+ union UNION_VEC_TYPE a1 = {.v = 0 };
+#endif // M0 > 1
+#if M0 > 2
+ union UNION_VEC_TYPE a2 = {.v = 0 };
+#endif // M0 > 2
+#if M0 > 3
+ union UNION_VEC_TYPE a3 = {.v = 0 };
+#endif // M0 > 3
+#if M0 > 4
+ union UNION_VEC_TYPE a4 = {.v = 0 };
+#endif // M0 > 4
+#if M0 > 5
+ union UNION_VEC_TYPE a5 = {.v = 0 };
+#endif // M0 > 5
+#if M0 > 6
+ union UNION_VEC_TYPE a6 = {.v = 0 };
+#endif // M0 > 6
+#if M0 > 7
+ union UNION_VEC_TYPE a7 = {.v = 0 };
+#endif // M0 > 7
+
+ REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
+
+ // Load from RHS matrix
+ LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);
+
+ // Load from LHS matrix
+ for(int k = 0; k < LEFTOVER_K; ++k)
+ {
+ a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
+#if M0 > 1
+ a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
+#endif // M0 > 1
+#if M0 > 2
+ a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
+#endif // M0 > 2
+#if M0 > 3
+ a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
+#endif // M0 > 3
+#if M0 > 4
+ a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
+#endif // M0 > 4
+#if M0 > 5
+ a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
+#endif // M0 > 5
+#if M0 > 6
+ a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
+#endif // M0 > 6
+#if M0 > 7
+ a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
+#endif // M0 > 7
+
+ lhs_offset += sizeof(DATA_TYPE);
+ }
+
+ // Accumulate
+ ARM_DOT_K0XN0(K0, a0.v, b, c0);
+#if M0 > 1
+ ARM_DOT_K0XN0(K0, a1.v, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+ ARM_DOT_K0XN0(K0, a2.v, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+ ARM_DOT_K0XN0(K0, a3.v, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+ ARM_DOT_K0XN0(K0, a4.v, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+ ARM_DOT_K0XN0(K0, a5.v, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+ ARM_DOT_K0XN0(K0, a6.v, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+ ARM_DOT_K0XN0(K0, a7.v, b, c7);
+#endif // M0 > 7
+
+#endif // LEFTOVER_K != 0
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
+
+ LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ const bool cond_y = y == 0;
+ const bool cond_x = ((x + 1) * N0 >= N);
+
+ // Store output block
+ STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+#undef LEFTOVER_K
+#undef PIXEL_UNIT
+}
+#endif // defined(OPENCL_IMAGE_SUPPORT)
+
+#define VFMA(a, b, c) \
+ ({ \
+ c = fma(a, b, c); \
+ })
+
+#if M0 == 1
+#define VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ })
+#elif M0 == 2 // M0 == 2
+#define VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ })
+#elif M0 == 3 // M0 == 3
+#define VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ })
+#elif M0 == 4 // M0 == 4
+#define VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ })
+#elif M0 == 5 // M0 == 5
+#define VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ })
+#elif M0 == 6 // M0 == 6
+#define VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ })
+#elif M0 == 7 // M0 == 7
+#define VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+ })
+#elif M0 == 8 // M0 == 8
+#define VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
+ })
+#else // M0 not supported
+#error "M0 not supported"
+#endif // M0 not supported
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix is NOT reshaped
+ * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
+ *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - H0 >= 1
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
+ * The activation function is performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix + * + * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32 + * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes) + * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes) + * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix + * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes) + * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes) + * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix + * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr + * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) + * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) + * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ 
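+// Example (hypothetical, for illustration only; not part of the original file): assuming an
+// F32 GEMM with M=64, N=32, K=128 and a non-interleaved reshaped RHS, a host program could
+// build this kernel with options such as:
+//   -DDATA_TYPE=float -DM=64 -DN=32 -DK=128 -DM0=4 -DN0=4 -DK0=4 -DH0=4
+//   -DPARTIAL_STORE_M0=0 -DPARTIAL_STORE_N0=0
+// where PARTIAL_STORE_M0 = M % M0 and PARTIAL_STORE_N0 = N % N0 (both 0 here because M and N
+// are exact multiples of the chosen block sizes), and M0/N0/K0/H0 respect the supported
+// configurations listed in the documentation above.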
+__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs), + IMAGE_DECLARATION(rhs), +#if defined(BETA) + IMAGE_DECLARATION(bias), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), + uint lhs_stride_z, + uint rhs_stride_z, +#if defined(BETA) + uint bias_stride_z, +#endif //defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint lhs_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D + ) +{ + // Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (N0) +#define RHS_STEP_X ((N0) * (H0)) +#define RHS_STEP_LOOP (1) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (N0) +#define RHS_STEP_LOOP (H0) +#endif // defined(RHS_INTERLEAVE) + + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; + + // Compute RHS reshaped matrix address + uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_offset += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0; + REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,zero2=0,... zero7=0; + +#if defined(REINTERPRET_INPUT_AS_3D) + + // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply lhs_stride_z by DEPTH_GEMM3D + lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + lhs_offset += z * lhs_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... 
c(N0-1)=0; + + int i = 0; + for(; i <= (K - K0); i += K0) + { + // Supported cases (M0, K0): + // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 + // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 + // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 + // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 + // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 + // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 + // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 + // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin); + + VEC_DATA_TYPE(DATA_TYPE, N0) + b0; + + b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE))); + VFMA_M0xN0(0, a, b0, c); + b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE))); + VFMA_M0xN0(1, a, b0, c); +#if K0 > 2 + b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE))); + VFMA_M0xN0(2, a, b0, c); +#endif // K0 > 2 +#if K0 > 3 + b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE))); + VFMA_M0xN0(3, a, b0, c); +#endif // K0 > 3 +#if K0 > 4 + b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE))); + VFMA_M0xN0(4, a, b0, c); + b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE))); + VFMA_M0xN0(5, a, b0, c); + b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE))); + VFMA_M0xN0(6, a, b0, c); + b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE))); + VFMA_M0xN0(7, a, b0, c); +#endif // K0 > 4 +#if K0 > 8 + b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE))); + VFMA_M0xN0(8, a, b0, c); + b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE))); + VFMA_M0xN0(9, a, b0, c); + b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE))); + VFMA_M0xN0(A, a, b0, c); + b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE))); + VFMA_M0xN0(B, a, b0, c); + b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE))); + VFMA_M0xN0(C, a, b0, c); + b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE))); + VFMA_M0xN0(D, a, b0, c); + b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE))); + VFMA_M0xN0(E, a, b0, c); + b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE))); + VFMA_M0xN0(F, a, b0, c); +#endif // K0 > 8 + + lhs_offset += K0 * sizeof(DATA_TYPE); + rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE); + } + + // Left-over accumulations + for(; i < K; ++i) + { + // Load values from LHS matrix + VEC_DATA_TYPE(DATA_TYPE, 2) + a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0)); +#if M0 > 1 + VEC_DATA_TYPE(DATA_TYPE, 2) + a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1)); +#endif // M0 > 1 +#if M0 > 2 + VEC_DATA_TYPE(DATA_TYPE, 2) + a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2)); +#endif // M0 > 2 +#if M0 > 3 + VEC_DATA_TYPE(DATA_TYPE, 2) + a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3)); +#endif // M0 > 3 +#if M0 > 4 + VEC_DATA_TYPE(DATA_TYPE, 2) + a4 = *((__global DATA_TYPE 
*)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
+#endif // M0 > 4
+#if M0 > 5
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
+#endif // M0 > 5
+#if M0 > 6
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
+#endif // M0 > 6
+#if M0 > 7
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
+#endif // M0 > 7
+
+ VEC_DATA_TYPE(DATA_TYPE, N0)
+ b0;
+
+ b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));
+ VFMA_M0xN0(0, a, b0, c);
+
+ lhs_offset += sizeof(DATA_TYPE);
+ rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
+
+ LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ const bool cond_y = y == 0;
+ const bool cond_x = ((x + 1) * N0 >= N);
+
+ // Store output block
+ STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+
+#if defined(OPENCL_IMAGE_SUPPORT)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix is NOT reshaped
+ * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
+ *
+ * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).
+ * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT= (e.g. -DRHS_HEIGHT=32)
+ * Since we cannot create a 3D image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
+ * could be different from the value returned by get_image_height(rhs_img).
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 4, 8, 16
+ * - K0 = 4, 8, 16
+ * - H0 >= 1
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
+ * The activation function is performed after the bias addition
+ * @note In case the input or output has to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns of the LHS matrix
+ *
+ * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F32
+ * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem (in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem (in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
+ * @param[in] rhs_img The RHS reshaped matrix as an OpenCL image object. Supported data type: same as @p lhs_ptr
+ * @param[in] bias_ptr (Optional) Pointer to the bias matrix.
Supported data type: same as @p lhs_ptr + * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) + * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) + * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs), + __read_only image2d_t rhs_img, +#if defined(BETA) + IMAGE_DECLARATION(bias), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), + uint lhs_stride_z, + uint rhs_stride_z, +#if defined(BETA) + uint bias_stride_z, +#endif //defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint lhs_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D + ) +{ + // Pixel unit +#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0) + + // Block size +#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (PIXEL_UNIT) +#define RHS_STEP_X ((PIXEL_UNIT) * (H0)) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (PIXEL_UNIT) +#endif // defined(RHS_INTERLEAVE) + + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + const uint z_rhs = (z % MATRIX_B_DEPTH); +#else // defined(MATRIX_B_DEPTH) + const uint z_rhs = z; +#endif // defined(MATRIX_B_DEPTH) + + // Compute RHS matrix coordinates + uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X; + const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT; + + REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 
0); + REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); + +#if defined(REINTERPRET_INPUT_AS_3D) + + // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply lhs_stride_z by DEPTH_GEMM3D + lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + lhs_offset += z * lhs_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); + + int i = 0; + for(; i <= (K - K0); i += K0) + { + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin); + + VEC_DATA_TYPE(DATA_TYPE, N0) + b0; + + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs)); + VFMA_M0xN0(0, a, b0, c); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs)); + VFMA_M0xN0(1, a, b0, c); +#if K0 > 2 + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs)); + VFMA_M0xN0(2, a, b0, c); +#endif // K0 > 2 +#if K0 > 3 + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs)); + VFMA_M0xN0(3, a, b0, c); +#endif // K0 > 3 +#if K0 > 4 + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs)); + VFMA_M0xN0(4, a, b0, c); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs)); + VFMA_M0xN0(5, a, b0, c); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs)); + VFMA_M0xN0(6, a, b0, c); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs)); + VFMA_M0xN0(7, a, b0, c); +#endif // K0 > 4 +#if K0 > 8 + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs)); + VFMA_M0xN0(8, a, b0, c); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs)); + VFMA_M0xN0(9, a, b0, c); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs)); + VFMA_M0xN0(A, a, b0, c); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs)); + VFMA_M0xN0(B, a, b0, c); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs)); + VFMA_M0xN0(C, a, b0, c); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs)); + VFMA_M0xN0(D, a, b0, c); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs)); + VFMA_M0xN0(E, a, b0, c); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs)); + VFMA_M0xN0(F, a, b0, c); +#endif // K0 > 8 + + lhs_offset += K0 * sizeof(DATA_TYPE); + x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP; + } + + // Left-over accumulations + for(; i < K; ++i) + { + // Load values from LHS matrix + VEC_DATA_TYPE(DATA_TYPE, 2) + a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0)); +#if M0 > 1 + VEC_DATA_TYPE(DATA_TYPE, 2) + a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1)); +#endif // M0 > 1 +#if M0 > 2 + VEC_DATA_TYPE(DATA_TYPE, 2) + a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2)); +#endif // M0 > 2 +#if M0 > 3 + VEC_DATA_TYPE(DATA_TYPE, 2) 
+ a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
+#endif // M0 > 3
+#if M0 > 4
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
+#endif // M0 > 4
+#if M0 > 5
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
+#endif // M0 > 5
+#if M0 > 6
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
+#endif // M0 > 6
+#if M0 > 7
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
+#endif // M0 > 7
+
+ VEC_DATA_TYPE(DATA_TYPE, N0)
+ b0;
+ b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
+
+ VFMA_M0xN0(0, a, b0, c);
+
+ lhs_offset += sizeof(DATA_TYPE);
+ x_rhs += RHS_STEP_X;
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
+
+ LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ const bool cond_y = y == 0;
+ const bool cond_x = ((x + 1) * N0 >= N);
+
+ // Store output block
+ STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(OPENCL_IMAGE_SUPPORT)
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(V0)
&& defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N) + +#if defined(MIXED_PRECISION) +#if K0 == 2 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c += a.s0 * b.s0; \ + c += a.s1 * b.s1; \ + }) +#elif K0 == 3 // K0 == 3 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c += a.s0 * b.s0; \ + c += a.s1 * b.s1; \ + c += a.s2 * b.s2; \ + }) +#elif K0 == 4 // K0 == 4 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c += a.s0 * b.s0; \ + c += a.s1 * b.s1; \ + c += a.s2 * b.s2; \ + c += a.s3 * b.s3; \ + }) +#elif K0 == 8 // K0 == 8 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c += a.s0 * b.s0; \ + c += a.s1 * b.s1; \ + c += a.s2 * b.s2; \ + c += a.s3 * b.s3; \ + c += a.s4 * b.s4; \ + c += a.s5 * b.s5; \ + c += a.s6 * b.s6; \ + c += a.s7 * b.s7; \ + }) +#elif K0 == 16 // K0 == 16 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c += a.s0 * b.s0; \ + c += a.s1 * b.s1; \ + c += a.s2 * b.s2; \ + c += a.s3 * b.s3; \ + c += a.s4 * b.s4; \ + c += a.s5 * b.s5; \ + c += a.s6 * b.s6; \ + c += a.s7 * b.s7; \ + c += a.s8 * b.s8; \ + c += a.s9 * b.s9; \ + c += a.sA * b.sA; \ + c += a.sB * b.sB; \ + c += a.sC * b.sC; \ + c += a.sD * b.sD; \ + c += a.sE * b.sE; \ + c += a.sF * b.sF; \ + }) +#else // K0 not supported +#error "K0 value not supported" +#endif // K0 conditions +#else // defined(MIXED_PRECISION) +#if K0 == 2 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c = fma(a.s0, b.s0, c); \ + c = fma(a.s1, b.s1, c); \ + }) +#elif K0 == 3 // K0 == 3 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c = fma(a.s0, b.s0, c); \ + c = fma(a.s1, b.s1, c); \ + c = fma(a.s2, b.s2, c); \ + }) +#elif K0 == 4 // K0 == 4 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c = fma(a.s0, b.s0, c); \ + c = fma(a.s1, b.s1, c); \ + c = fma(a.s2, b.s2, c); \ + c = fma(a.s3, b.s3, c); \ + }) +#elif K0 == 8 // K0 == 8 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c = fma(a.s0, b.s0, c); \ + c = fma(a.s1, b.s1, c); \ + c = fma(a.s2, b.s2, c); \ + c = fma(a.s3, b.s3, c); \ + c = fma(a.s4, b.s4, c); \ + c = fma(a.s5, b.s5, c); \ + c = fma(a.s6, b.s6, c); \ + c = fma(a.s7, b.s7, c); \ + }) +#elif K0 == 16 // K0 == 16 +#define ARM_DOT_K0(a, b, c) \ + ({ \ + c = fma(a.s0, b.s0, c); \ + c = fma(a.s1, b.s1, c); \ + c = fma(a.s2, b.s2, c); \ + c = fma(a.s3, b.s3, c); \ + c = fma(a.s4, b.s4, c); \ + c = fma(a.s5, b.s5, c); \ + c = fma(a.s6, b.s6, c); \ + c = fma(a.s7, b.s7, c); \ + c = fma(a.s8, b.s8, c); \ + c = fma(a.s9, b.s9, c); \ + c = fma(a.sA, b.sA, c); \ + c = fma(a.sB, b.sB, c); \ + c = fma(a.sC, b.sC, c); \ + c = fma(a.sD, b.sD, c); \ + c = fma(a.sE, b.sE, c); \ + c = fma(a.sF, b.sF, c); \ + }) +#else // K0 not supported +#error "K0 value not supported" +#endif // K0 conditions +#endif // defined(MIXED_PRECISION) + +#if defined(ARM_DOT_K0XN0) +#undef ARM_DOT_K0XN0 +#endif // defined(ARM_DOT_K0XN0) + +#if N0 == 2 +#define ARM_DOT_K0XN0(a, b, c) \ + ({ \ + ARM_DOT_K0((a), (b##0), (c.s0)); \ + ARM_DOT_K0((a), (b##1), (c.s1)); \ + }) +#elif N0 == 3 // N0 == 3 +#define ARM_DOT_K0XN0(a, b, c) \ + ({ \ + ARM_DOT_K0((a), (b##0), (c.s0)); \ + ARM_DOT_K0((a), (b##1), (c.s1)); \ + ARM_DOT_K0((a), (b##2), (c.s2)); \ + }) +#elif N0 == 4 // N0 == 4 +#define ARM_DOT_K0XN0(a, b, c) \ + ({ \ + ARM_DOT_K0((a), (b##0), (c.s0)); \ + ARM_DOT_K0((a), (b##1), (c.s1)); \ + ARM_DOT_K0((a), (b##2), (c.s2)); \ + ARM_DOT_K0((a), (b##3), (c.s3)); \ + }) +#elif N0 == 8 // N0 == 8 +#define ARM_DOT_K0XN0(a, b, c) \ + ({ \ + ARM_DOT_K0((a), (b##0), (c.s0)); \ + ARM_DOT_K0((a), (b##1), (c.s1)); \ + ARM_DOT_K0((a), (b##2), (c.s2)); \ + ARM_DOT_K0((a), (b##3), (c.s3)); \ + ARM_DOT_K0((a), (b##4), 
(c.s4)); \
+ ARM_DOT_K0((a), (b##5), (c.s5)); \
+ ARM_DOT_K0((a), (b##6), (c.s6)); \
+ ARM_DOT_K0((a), (b##7), (c.s7)); \
+ })
+#elif N0 == 16 // N0 == 16
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ ARM_DOT_K0((a), (b##2), (c.s2)); \
+ ARM_DOT_K0((a), (b##3), (c.s3)); \
+ ARM_DOT_K0((a), (b##4), (c.s4)); \
+ ARM_DOT_K0((a), (b##5), (c.s5)); \
+ ARM_DOT_K0((a), (b##6), (c.s6)); \
+ ARM_DOT_K0((a), (b##7), (c.s7)); \
+ ARM_DOT_K0((a), (b##8), (c.s8)); \
+ ARM_DOT_K0((a), (b##9), (c.s9)); \
+ ARM_DOT_K0((a), (b##A), (c.sA)); \
+ ARM_DOT_K0((a), (b##B), (c.sB)); \
+ ARM_DOT_K0((a), (b##C), (c.sC)); \
+ ARM_DOT_K0((a), (b##D), (c.sD)); \
+ ARM_DOT_K0((a), (b##E), (c.sE)); \
+ ARM_DOT_K0((a), (b##F), (c.sF)); \
+ })
+#else // N0 not supported
+#error "N0 value not supported"
+#endif // N0 conditions
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must NOT be transposed
+ * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
+ * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - V0 >= 1
+ * - H0 >= 1
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should also be passed at compile time using -DA_VAL= and -DB_VAL= respectively.
+ * The activation function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g.
output of convolution layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped + * + * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32 + * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes) + * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes) + * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix + * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes) + * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes) + * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix + * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr + * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) + * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) + * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped. 
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), + IMAGE_DECLARATION(rhs), +#if defined(BETA) + IMAGE_DECLARATION(bias), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), + uint k, + uint lhs_stride_z, + uint rhs_stride_z, +#if defined(BETA) + uint bias_stride_z, +#endif //defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D + ) +{ + // Block size +#define LHS_BLOCK_SIZE ((K0) * (M0)) + +#if defined(LHS_INTERLEAVE) +#define LHS_OFFSET_X (K0) +#define LHS_STEP_X ((K0) * (V0)) +#define LHS_STEP_LOOP (1) +#else // defined(INTERLEAVE) +#define LHS_OFFSET_X (LHS_BLOCK_SIZE) +#define LHS_STEP_X (K0) +#define LHS_STEP_LOOP (V0) +#endif // defined(INTERLEAVE) + + // Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (K0) +#define RHS_STEP_X ((K0) * (H0)) +#define RHS_STEP_LOOP (1) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (K0) +#define RHS_STEP_LOOP (H0) +#endif // defined(RHS_INTERLEAVE) + +#if defined(DUMMY_WORK_ITEMS) + if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y + + (get_global_id(2) * lhs_stride_z); + + // Compute RHS matrix address + __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_addr += get_global_id(2) * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... 
zlhs7=0;
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+ for(int i = 0; i < k; i += K0)
+ {
+ // Supported cases (M0, K0):
+ // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+ // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+ // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+ // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+ // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+ // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+ // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+ // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
+
+ // Load values from RHS matrix
+ LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);
+
+ // Accumulate
+ ARM_DOT_K0XN0(a0, b, c0);
+#if M0 > 1
+ ARM_DOT_K0XN0(a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+ ARM_DOT_K0XN0(a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+ ARM_DOT_K0XN0(a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+ ARM_DOT_K0XN0(a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+ ARM_DOT_K0XN0(a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+ ARM_DOT_K0XN0(a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+ ARM_DOT_K0XN0(a7, b, c7);
+#endif // M0 > 7
+
+ lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
+ rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += get_global_id(2) * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+#if defined(MIXED_PRECISION)
+ CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+ ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
+#else // defined(MIXED_PRECISION)
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+#endif // defined(MIXED_PRECISION)
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(2) * bias_stride_z;
+
+ LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+#if defined(MIXED_PRECISION)
+ CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+ ADD_BLOCK(M0, c, bias_hp);
+#else // defined(MIXED_PRECISION)
+ ADD_BLOCK(M0, c, bias);
+#endif // defined(MIXED_PRECISION)
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+#if defined(MIXED_PRECISION)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL);
+#else // defined(MIXED_PRECISION)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
+#endif // defined(MIXED_PRECISION)
+#endif // defined(ACTIVATION_TYPE)
+
+ const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+ const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
+ // Store output block
+#if defined(MIXED_PRECISION)
+ CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
+ STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+#else // defined(MIXED_PRECISION)
+ STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+#endif // defined(MIXED_PRECISION)
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+#undef LHS_STEP_LOOP
+#undef RHS_STEP_LOOP
+}
+
+#if defined(OPENCL_IMAGE_SUPPORT)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in an OpenCL image object.
+ * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must NOT be transposed
+ * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
+ *
+ * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
+ * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
+ * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT= (e.g. -DRHS_HEIGHT=32)
+ * Since we cannot create a 3D image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
+ * could be different from the value returned by get_image_height(rhs_img).
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) + * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 2, 3, 4, 5, 6, 7, 8 + * - N0 = 4, 8, 16 + * - K0 = 4, 8, 16 + * - V0 >= 1 + * - H0 >= 1 + * + * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively. + * The activation function is performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped + * + * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F32 + * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes) + * @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes) + * @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix + * @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr + * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr + * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) + * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) + * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix + * @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs), + __read_only image2d_t rhs_img, +#if defined(BETA) + IMAGE_DECLARATION(bias), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), + uint k, + uint lhs_stride_z, + uint rhs_stride_z, +#if defined(BETA) + uint bias_stride_z, +#endif //defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D + ) +{ + // Pixel unit +#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0) + + // Block size +#define LHS_BLOCK_SIZE ((K0) * (M0)) + +#if defined(LHS_INTERLEAVE) +#define LHS_OFFSET_X (K0) +#define LHS_STEP_X ((K0) * (V0)) +#define LHS_STEP_LOOP (1) +#else // defined(LHS_INTERLEAVE) +#define LHS_OFFSET_X (LHS_BLOCK_SIZE) +#define LHS_STEP_X (K0) +#define LHS_STEP_LOOP (V0) +#endif // defined(LHS_INTERLEAVE) + + // Block size +#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (PIXEL_UNIT) +#define RHS_STEP_X (PIXEL_UNIT * (H0)) +#define RHS_STEP_LOOP (1) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (PIXEL_UNIT) +#define RHS_STEP_LOOP (H0) +#endif // defined(RHS_INTERLEAVE) + +#if defined(DUMMY_WORK_ITEMS) + if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y + + (get_global_id(2) * lhs_stride_z); + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH); +#else // defined(MATRIX_B_DEPTH) + const uint z_rhs = get_global_id(2); +#endif // defined(MATRIX_B_DEPTH) + + // Compute RHS matrix coordinates + uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X; + const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT; + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,...
zlhs7=0; + REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); + + for(int i = 0; i < k; i += K0) + { + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs); + + // Load values from RHS matrix stored in a cl_image + REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0); + LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0); + + // Accumulate + ARM_DOT_K0XN0(a0, b, c0); +#if M0 > 1 + ARM_DOT_K0XN0(a1, b, c1); +#endif // M0 > 1 +#if M0 > 2 + ARM_DOT_K0XN0(a2, b, c2); +#endif // M0 > 2 +#if M0 > 3 + ARM_DOT_K0XN0(a3, b, c3); +#endif // M0 > 3 +#if M0 > 4 + ARM_DOT_K0XN0(a4, b, c4); +#endif // M0 > 4 +#if M0 > 5 + ARM_DOT_K0XN0(a5, b, c5); +#endif // M0 > 5 +#if M0 > 6 + ARM_DOT_K0XN0(a6, b, c6); +#endif // M0 > 6 +#if M0 > 7 + ARM_DOT_K0XN0(a7, b, c7); +#endif // M0 > 7 + + lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE); + + x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP; + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += get_global_id(2) * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) +#if defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); + + LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, DATA_TYPE, bias, BETA); +#endif // UNIT_BETA + + // c = c + bias[broadcasted] +#if defined(MIXED_PRECISION) + CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); + ADD_BLOCK_BROADCAST(M0, c, bias_hp0); +#else // defined(MIXED_PRECISION) + ADD_BLOCK_BROADCAST(M0, c, bias0); +#endif // defined(MIXED_PRECISION) + +#else // defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id( + 2) * bias_stride_z; + + LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); +#endif // UNIT_BETA + + // c = c + bias +#if defined(MIXED_PRECISION) + CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); + ADD_BLOCK(M0, c, bias_hp); +#else // defined(MIXED_PRECISION) + ADD_BLOCK(M0, c, bias); +#endif // defined(MIXED_PRECISION) + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) +#if defined(MIXED_PRECISION) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL); +#else // defined(MIXED_PRECISION) +
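// Non-mixed-precision path: apply the fused activation directly on the DATA_TYPE accumulators +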
ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL); +#endif // defined(MIXED_PRECISION) +#endif // defined(ACTIVATION_TYPE) + + const bool cond_y = ((get_global_id(1) + 1) * M0 >= M); + const bool cond_x = ((get_global_id(0) + 1) * N0 >= N); + + // Store output block +#if defined(MIXED_PRECISION) + CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); +#else // defined(MIXED_PRECISION) + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); +#endif // defined(MIXED_PRECISION) + +#undef LHS_BLOCK_SIZE +#undef LHS_OFFSET_X +#undef LHS_STEP_X +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +#undef PIXEL_UNIT +#undef LHS_STEP_LOOP +#undef RHS_STEP_LOOP +} +#endif // defined(OPENCL_IMAGE_SUPPORT) + +#if defined(LHS_TRANSPOSE) + +#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE) + +#if defined(MIXED_PRECISION) + +#if(GPU_ARCH == GPU_ARCH_MIDGARD) +#define ARM_VFMA(N0, a, b, c) c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))); +#else // GPU_ARCH == GPU_ARCH_MIDGARD +#define ARM_VFMA(N0, a, b, c) c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c)); +#endif // GPU_ARCH == GPU_ARCH_MIDGARD + +#else // defined(MIXED_PRECISION) + +#if(GPU_ARCH == GPU_ARCH_MIDGARD) +#define ARM_VFMA(N0, a, b, c) c += (a) * (b); +#else // GPU_ARCH == GPU_ARCH_MIDGARD +#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c)); +#endif // GPU_ARCH == GPU_ARCH_MIDGARD + +#endif // defined(MIXED_PRECISION) + +#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C) \ + ({ \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); \ + }) +#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C) \ + ({ \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \ + }) +#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C) \ + ({ \ + ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \ + }) +#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C) \ + ({ \ + ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \ + }) +#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C) \ + ({ \ + ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \ + ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \ + }) + +// Factory macro for the column-vector (transposed) by row-vector (not transposed) multiplication.
K0 = 1 +// a is the column-vector (transposed) +// b is the row-vector (not transposed) +// C is the output matrix +// Lower case is a vector (a, b) +// Upper case is a matrix (C) +#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C) + +#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C) \ + ({ \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \ + }) +#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C) \ + ({ \ + ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \ + }) +#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C) \ + ({ \ + ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \ + }) +#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C) \ + ({ \ + ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \ + }) +#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C) \ + ({ \ + ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \ + }) +#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C) \ + ({ \ + ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##B), (B##B), C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \ + ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \ + }) + +// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication. +// The dimensions for this matrix multiplication are defined through M0, N0 and K0 +// The dimensions supported are: +// M0: 1, 2, 3, 4, 8 +// N0: 1, 2, 3, 4, 8, 16 +// K0: 1, 2, 3, 4, 8, 16 +// This macro calls the vector-by-matrix macro K0 times +// A, B and C are matrices +#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \ + CONCAT(ARM_MM_T_NT_M0xN0x, K0) \ + (M0, N0, TYPE, A, B, C) + +/** This OpenCL kernel computes the matrix multiplication between 2 matrices. + * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed + * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed + * + * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE). + * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. + * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24). + * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4). + * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2) + * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g.
-DH0=2) + * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time. + * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time. + * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) + * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 2, 3, 4, 8 + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 + * - V0 >= 1 + * - H0 >= 1 + * + * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively. + * The activation function is performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped + * + * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32 + * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes) + * @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes) + * @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix + * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes) + * @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes) + * @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix + * @param[in] bias_ptr (Optional) Pointer to the bias matrix.
Supported data type: same as @p lhs_ptr + * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) + * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) + * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix + * @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped. + * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs), + IMAGE_DECLARATION(rhs), +#if defined(BETA) + IMAGE_DECLARATION(bias), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), + uint k, + uint lhs_stride_z, + uint rhs_stride_z, +#if defined(BETA) + uint bias_stride_z, +#endif //defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D + ) +{ + // Block size +#define LHS_BLOCK_SIZE ((K0) * (M0)) + +#if defined(LHS_INTERLEAVE) +#define LHS_OFFSET_X (M0) +#define LHS_STEP_X ((M0) * (V0)) +#define LHS_STEP_LOOP (1) +#else // defined(LHS_INTERLEAVE) +#define LHS_OFFSET_X (LHS_BLOCK_SIZE) +#define LHS_STEP_X (M0) +#define LHS_STEP_LOOP (V0) +#endif // defined(LHS_INTERLEAVE) + + // Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (N0) +#define RHS_STEP_X ((N0) * (H0)) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (N0) +#endif // defined(RHS_INTERLEAVE) + + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z); + + // Compute RHS matrix address + __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 +
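// (the modulo below wraps the batch index so the available RHS batch slices are reused across the extra LHS batches) +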
rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_addr += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0); + + __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr); + __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr); + + for(int i = 0; i < k; i += K0) + { + VEC_DATA_TYPE(DATA_TYPE, M0) + a0; + VEC_DATA_TYPE(DATA_TYPE, N0) + b0; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + +#if K0 > 1 + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; +#endif // K0 > 1 + +#if K0 > 2 + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; +#endif // K0 > 2 + +#if K0 > 3 + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; +#endif // K0 > 3 + +#if K0 > 4 + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; +#endif // K0 > 4 + +#if K0 > 8 + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = VLOAD(N0)(0, rhs); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + rhs += RHS_STEP_X; +#endif // K0 > 8 + +#ifndef LHS_INTERLEAVE + lhs += (M0 * K0 * (V0 - 1)); +#endif // LHS_INTERLEAVE + +#ifndef RHS_INTERLEAVE + rhs += (N0 * K0 * (H0 - 1)); +#endif // RHS_INTERLEAVE + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, 
y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) +#if defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)); + + LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, DATA_TYPE, bias, BETA); +#endif // UNIT_BETA + + // c = c + bias[broadcasted] +#if defined(MIXED_PRECISION) + CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); + ADD_BLOCK_BROADCAST(M0, c, bias_hp0); +#else // defined(MIXED_PRECISION) + ADD_BLOCK_BROADCAST(M0, c, bias0); +#endif // defined(MIXED_PRECISION) + +#else // defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id( + 2) * bias_stride_z; + + LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); +#endif // UNIT_BETA + +#if defined(MIXED_PRECISION) + CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); + ADD_BLOCK(M0, c, bias_hp); +#else // defined(MIXED_PRECISION) + ADD_BLOCK(M0, c, bias); +#endif // defined(MIXED_PRECISION) + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) +#if defined(MIXED_PRECISION) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL); +#else // defined(MIXED_PRECISION) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL); +#endif // defined(MIXED_PRECISION) +#endif // defined(ACTIVATION_TYPE) + + const bool cond_y = ((get_global_id(1) + 1) * M0 >= M); + const bool cond_x = ((get_global_id(0) + 1) * N0 >= N); + + // Store output block +#if defined(MIXED_PRECISION) + CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); +#else // defined(MIXED_PRECISION) + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); +#endif // defined(MIXED_PRECISION) + +#undef LHS_BLOCK_SIZE +#undef LHS_OFFSET_X +#undef LHS_STEP_X +#undef LHS_STEP_LOOP +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X +#undef RHS_STEP_X +} + +#if defined(OPENCL_IMAGE_SUPPORT) +/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in an OpenCL image object. + * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed + * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed + * + * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel + * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
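+ * @note Illustrative example only (hypothetical values, not mandated by this interface): this variant could, for instance, be built with
+ *       -DLHS_TRANSPOSE -DOPENCL_IMAGE_SUPPORT -DDATA_TYPE=float -DDATA_TYPE_ACCUMULATOR=float -DM=64 -DN=64 -DK=128 -DM0=4 -DN0=4 -DK0=4 -DV0=2 -DH0=2 -DRHS_HEIGHT=64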
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. + * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24). + * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT= (e.g. -DRHS_HEIGHT=32) + * Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT + * could be different from the value returned by get_image_height(rhs_img). + * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4). + * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2) + * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2) + * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time. + * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time. + * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) + * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 2, 3, 4, 8 + * - N0 = 4, 8, 16 + * - K0 = 4, 8, 16 + * - V0 >= 1 + * - H0 >= 1 + * + * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively. + * The activation function is performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped + * + * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F32 + * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes) + * @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes) + * @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix + * @param[in] rhs_img The RHS reshaped matrix as 2D OpenCL image object. Supported data type: same as @p lhs_ptr + * @param[in] bias_ptr (Optional) Pointer to the bias matrix.
Supported data type: same as @p lhs_ptr + * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) + * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) + * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix + * @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped. + * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs), + __read_only image2d_t rhs_img, +#if defined(BETA) + IMAGE_DECLARATION(bias), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), + uint k, + uint lhs_stride_z, + uint rhs_stride_z, +#if defined(BETA) + uint bias_stride_z, +#endif //defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D + ) +{ + // Pixel unit +#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0) + + // Block size +#define LHS_BLOCK_SIZE ((K0) * (M0)) + +#if defined(LHS_INTERLEAVE) +#define LHS_OFFSET_X (M0) +#define LHS_STEP_X ((M0) * (V0)) +#define LHS_STEP_LOOP (1) +#else // defined(LHS_INTERLEAVE) +#define LHS_OFFSET_X (LHS_BLOCK_SIZE) +#define LHS_STEP_X (M0) +#define LHS_STEP_LOOP (V0) +#endif // defined(LHS_INTERLEAVE) + + // Block size +#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT)) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (PIXEL_UNIT) +#define RHS_STEP_X ((PIXEL_UNIT) * (H0)) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) +#define RHS_STEP_X (PIXEL_UNIT) +#endif // defined(RHS_INTERLEAVE) + + const uint x = get_global_id(0); + const uint y = get_global_id(1); + const uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z); + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + const uint z_rhs = (z % MATRIX_B_DEPTH); +#else // defined(MATRIX_B_DEPTH) +
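// One RHS batch slice per LHS batch: the RHS batch index simply follows z +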
const uint z_rhs = z; +#endif // defined(MATRIX_B_DEPTH) + + // Compute RHS matrix coordinates + uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X; + const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT; + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0); + + __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr); + + for(int i = 0; i < k; i += K0) + { + VEC_DATA_TYPE(DATA_TYPE, M0) + a0; + VEC_DATA_TYPE(DATA_TYPE, N0) + b0; + + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + +#if K0 > 1 + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; +#endif // K0 > 1 + +#if K0 > 2 + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; +#endif // K0 > 2 + +#if K0 > 3 + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; +#endif // K0 > 3 + +#if K0 > 4 + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; +#endif // K0 > 4 + +#if K0 > 8 + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; + + a0 = VLOAD(M0)(0, lhs); + b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT,
rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs)); + + ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); + + lhs += LHS_STEP_X; +#endif // K0 > 8 + +#ifndef LHS_INTERLEAVE + lhs += (M0 * K0 * (V0 - 1)); +#endif // LHS_INTERLEAVE + + x_rhs += K0 * RHS_STEP_X; +#ifndef RHS_INTERLEAVE + x_rhs += (PIXEL_UNIT * K0 * (H0 - 1)); +#endif // RHS_INTERLEAVE + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) +#if defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)); + + LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, DATA_TYPE, bias, BETA); +#endif // UNIT_BETA + + // c = c + bias[broadcasted] +#if defined(MIXED_PRECISION) + CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); + ADD_BLOCK_BROADCAST(M0, c, bias_hp0); +#else // defined(MIXED_PRECISION) + ADD_BLOCK_BROADCAST(M0, c, bias0); +#endif // defined(MIXED_PRECISION) + +#else // defined(BROADCAST_BIAS) + __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z; + + LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); +#endif // UNIT_BETA + +#if defined(MIXED_PRECISION) + CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); + ADD_BLOCK(M0, c, bias_hp); +#else // defined(MIXED_PRECISION) + ADD_BLOCK(M0, c, bias); +#endif // defined(MIXED_PRECISION) + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) +#if defined(MIXED_PRECISION) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL); +#else // defined(MIXED_PRECISION) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL); +#endif // defined(MIXED_PRECISION) +#endif // defined(ACTIVATION_TYPE) + + const bool cond_y = ((get_global_id(1) + 1) * M0 >= M); + const bool cond_x = ((get_global_id(0) + 1) * N0 >= N); + + // Store output block +#if defined(MIXED_PRECISION) + CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); +#else // defined(MIXED_PRECISION) + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); +#endif // defined(MIXED_PRECISION) + +#undef LHS_BLOCK_SIZE +#undef LHS_OFFSET_X +#undef LHS_STEP_X +#undef RHS_BLOCK_SIZE +#undef RHS_OFFSET_X
+#undef RHS_STEP_X +#undef PIXEL_UNIT +#undef LHS_STEP_LOOP +#undef RHS_STEP_LOOP +} +#endif // defined(OPENCL_IMAGE_SUPPORT) + +#endif // defined(LHS_TRANSPOSE) + +#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) && defined(DATA_TYPE) + +#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE) + +#define VFMA(a, b, c) \ + ({ \ + c = fma(a, b, c); \ + }) + +#if M0 == 1 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + }) +#elif M0 == 2 // M0 == 2 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + }) +#elif M0 == 3 // M0 == 3 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + }) +#elif M0 == 4 // M0 == 4 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + }) +#elif M0 == 5 // M0 == 5 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ + }) +#elif M0 == 6 // M0 == 6 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ + }) +#elif M0 == 7 // M0 == 7 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ + }) +#elif M0 == 8 // M0 == 8 +#define RHS_VFMA_M0xN0(i, a, b, c) \ + ({ \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ + VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \ + }) +#else // M0 not supported +#error "M0 not supported" +#endif // M0 not supported + +/** This OpenCL kernel computes the matrix multiplication between 2 
matrices. + * The LHS matrix is NOT reshaped + * The RHS matrix is NOT reshaped + * + * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. + * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90) + * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64) + * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2) + * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g., -DK0=2) + * @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. -DN0=2) + * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) + * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 + * + * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively. + * The activation function is performed after the bias addition + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix + * + * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32 + * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes) + * @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes) + * @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix + * @param[in] rhs_ptr Pointer to the RHS matrix. Supported data type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS matrix in X dimension (in bytes) + * @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS matrix in Y dimension (in bytes) + * @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS matrix + * @param[in] bias_ptr (Optional) Pointer to the bias matrix.
Supported data type: same as @p lhs_ptr + * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) + * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) + * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix + * @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes) + * @param[in] rhs_stride_z Stride of the RHS matrix in Z dimension (in bytes) + * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs), + IMAGE_DECLARATION(rhs), +#if defined(BETA) + IMAGE_DECLARATION(bias), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), + uint lhs_stride_z, + uint rhs_stride_z, +#if defined(BETA) + uint bias_stride_z, +#endif //defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint lhs_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D + ) +{ + // Block size +#define RHS_BLOCK_SIZE ((K0) * (N0)) + + // RHS offset and step X +#define RHS_OFFSET_X (RHS_BLOCK_SIZE) + + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; + + // Compute RHS matrix address + uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE); + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_offset += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); + REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); + +#if defined(REINTERPRET_INPUT_AS_3D) + // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); + + // Add offset
for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply lhs_stride_z by DEPTH_GEMM3D + lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + lhs_offset += z * lhs_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0; + + int i = 0; + for(; i <= (K - K0); i += K0) + { + // Supported cases (M0, K0): + // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 + // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 + // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 + // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 + // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 + // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 + // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 + // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); + + // Load values from RHS matrix + LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero); + + RHS_VFMA_M0xN0(0, a, b0, c); + RHS_VFMA_M0xN0(1, a, b1, c); +#if K0 > 2 + RHS_VFMA_M0xN0(2, a, b2, c); +#endif // K0 > 2 +#if K0 > 3 + RHS_VFMA_M0xN0(3, a, b3, c); +#endif // K0 > 3 +#if K0 > 4 + RHS_VFMA_M0xN0(4, a, b4, c); + RHS_VFMA_M0xN0(5, a, b5, c); + RHS_VFMA_M0xN0(6, a, b6, c); + RHS_VFMA_M0xN0(7, a, b7, c); +#endif // K0 > 4 +#if K0 > 8 + RHS_VFMA_M0xN0(8, a, b8, c); + RHS_VFMA_M0xN0(9, a, b9, c); + RHS_VFMA_M0xN0(A, a, bA, c); + RHS_VFMA_M0xN0(B, a, bB, c); + RHS_VFMA_M0xN0(C, a, bC, c); + RHS_VFMA_M0xN0(D, a, bD, c); + RHS_VFMA_M0xN0(E, a, bE, c); + RHS_VFMA_M0xN0(F, a, bF, c); +#endif // K0 > 8 + + lhs_offset += K0 * sizeof(DATA_TYPE); + rhs_offset += K0 * rhs_stride_y; + } + + // Left-over accumulations + for(; i < K; ++i) + { + // Load values from LHS matrix + VEC_DATA_TYPE(DATA_TYPE, 2) + a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0)); +#if M0 > 1 + VEC_DATA_TYPE(DATA_TYPE, 2) + a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1)); +#endif // M0 > 1 +#if M0 > 2 + VEC_DATA_TYPE(DATA_TYPE, 2) + a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2)); +#endif // M0 > 2 +#if M0 > 3 + VEC_DATA_TYPE(DATA_TYPE, 2) + a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3)); +#endif // M0 > 3 +#if M0 > 4 + VEC_DATA_TYPE(DATA_TYPE, 2) + a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4)); +#endif // M0 > 4 +#if M0 > 5 + VEC_DATA_TYPE(DATA_TYPE, 2) + a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5)); +#endif // M0 > 5 +#if M0 > 6 + VEC_DATA_TYPE(DATA_TYPE, 2) + a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6)); +#endif // M0 > 6 +#if M0 > 7 + VEC_DATA_TYPE(DATA_TYPE, 2) + a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7)); +#endif // M0 > 7 + + VEC_DATA_TYPE(DATA_TYPE, N0) + b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y)); + RHS_VFMA_M0xN0(0, a, b, c); + + lhs_offset += sizeof(DATA_TYPE); + rhs_offset += rhs_stride_y; + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + 
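// (the zout offsets computed below also account for the dst_cross_plane_pad rows between consecutive output planes) +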
CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
+
+    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias
+    ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    const bool cond_y = y == 0;
+    const bool cond_x = ((x + 1) * N0 >= N);
+
+    // Store output block
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
+
+#if defined(BETA)
+/** This OpenCL kernel performs an in-place matrix addition between two matrices, taking into account that the second matrix might be weighted by a scalar value beta:
+ *
+ * @note The beta's value needs to be passed at compile time using -DBETA
+ *
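+ * @note Illustrative example (values chosen purely for illustration): building with -DBETA=0.5f makes each work-item compute out = alpha_ab + 0.5f * c on one float4 chunk, i.e. dst := dst + 0.5 * src
+ *
+ * @param[in] src_ptr Pointer to the source matrix. 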
Supported data types: F32
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src),
+                          TENSOR3D_DECLARATION(dst))
+{
+    // Compute source and destination addresses
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    // Load values from A x B
+    float4 alpha_ab = vload4(0, (__global float *)dst.ptr);
+
+    // Load values from Matrix C
+    float4 c = vload4(0, (__global float *)src.ptr);
+
+    // Computes alpha * axb + beta * c
+    float4 out = alpha_ab + (float4)BETA * c;
+
+    // Store final result in axb matrix
+    vstore4(out, 0, (__global float *)dst.ptr);
+}
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+/** This OpenCL kernel performs an in-place matrix addition between two matrices, taking into account that the second matrix might be weighted by a scalar value beta:
+ *
+ * @note The beta's value needs to be passed at compile time using -DBETA
+ *
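+ * @note This is the F16 counterpart of gemm_ma_f32 above; it is compiled only when FP16 support is enabled (guarded by ARM_COMPUTE_OPENCL_FP16_ENABLED)
+ *
+ * @param[in] src_ptr Pointer to the source matrix. 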
Supported data types: F16
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),
+                          TENSOR3D_DECLARATION(dst))
+{
+    // Compute source and destination addresses
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    // Load values from A x B
+    half8 alpha_ab = vload8(0, (__global half *)dst.ptr);
+
+    // Load values from Matrix C
+    half8 c = vload8(0, (__global half *)src.ptr);
+
+    // Computes alpha * axb + beta * c
+    half8 out = alpha_ab + (half8)BETA * c;
+
+    // Store final result in axb matrix
+    vstore8(out, 0, (__global half *)dst.ptr);
+}
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+#endif // defined(BETA)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/gemm_v1.cl b/src/core/CL/cl_kernels/common/gemm_v1.cl
new file mode 100644
index 0000000000..a136a1b96b
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/gemm_v1.cl
@@ -0,0 +1,3243 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "gemm_helpers.h"
+#include "repeat.h"
+
+#if defined(M) && defined(N) && defined(K) && defined(H0) && defined(V0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) && defined(IN1_DIM_X)
+/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
+ *
+ * @note The number of rows of destination matrix must be passed at compile time using -DM
+ * @note The number of columns of the destination matrix must be passed at compile time using -DN
+ * @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK
+ * @note The number of columns of the reshaped rhs matrix must be passed at compile time using -DIN1_DIM_X
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note The optional alpha's value needs to be passed at compile time using -DALPHA
+ * @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. 
Supported data types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),
+                                                 IMAGE_DECLARATION(src1),
+#if defined(BETA)
+                                                 IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+                                                 IMAGE_DECLARATION(dst),
+                                                 uint src0_stride_z,
+                                                 uint src1_stride_z,
+#if defined(BETA)
+                                                 uint src2_stride_z,
+#endif //defined(BETA)
+                                                 uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                 ,
+                                                 uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                )
+{
+    int x = get_global_id(0) / H0;
+    int y = get_global_id(1) / V0;
+    int z = get_global_id(2);
+
+    // Offset
+    const int offset_row_a = (get_global_id(1) % V0) * 4;
+    const int offset_row_b = (get_global_id(0) % H0) * 4;
+
+    // src_addr_a = address of matrix A
+    // src_addr_b = address of matrix B
+    int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+    int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+    src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
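+    // Turn the byte offsets into element pointers; the offset_row_a / offset_row_b adjustments below pick this work-item's sub-block within the interleaved and transposed rows
+    __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
+    __global float *src_addr_b = (__global float *)(src1_ptr + 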
src1_addr_in_bytes); + + // Compute end row address for matrix B + __global float *src_end_addr_b = src_addr_b + IN1_DIM_X; + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; + + // Reset accumulators + float4 c0 = 0.0f; + float4 c1 = 0.0f; + float4 c2 = 0.0f; + float4 c3 = 0.0f; + + for(; src_addr_b <= (src_end_addr_b - (int)(8 * H0)); src_addr_a += 8 * V0, src_addr_b += 8 * H0) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + float4 a0 = vload4(0, src_addr_a); + float4 b0 = vload4(0, src_addr_b); + + c0 += (float4)a0.s0 * b0; + c1 += (float4)a0.s1 * b0; + c2 += (float4)a0.s2 * b0; + c3 += (float4)a0.s3 * b0; + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a + 4 * V0); + b0 = vload4(0, src_addr_b + 4 * H0); + + c0 += (float4)a0.s0 * b0; + c1 += (float4)a0.s1 * b0; + c2 += (float4)a0.s2 * b0; + c3 += (float4)a0.s3 * b0; + } + + for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * V0, src_addr_b += 4 * H0) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + float4 a0 = vload4(0, src_addr_a); + float4 b0 = vload4(0, src_addr_b); + + c0 += (float4)a0.s0 * b0; + c1 += (float4)a0.s1 * b0; + c2 += (float4)a0.s2 * b0; + c3 += (float4)a0.s3 * b0; + } + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension + // in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(4, float, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+    REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
+
+    LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, float, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(4, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
+                                    2) * src2_stride_z;
+
+    LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(4, float, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias
+    ADD_BLOCK(4, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, VEC_SIZE, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store 4x4 block
+    const bool cond_y = ((get_global_id(1) + 1) * 4 >= M);
+    const bool cond_x = ((get_global_id(0) + 1) * 4 >= N);
+    STORE_BLOCK_BOUNDARY_AWARE(4, 4, float, c, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+}
+
+/** This OpenCL kernel is optimized for Bifrost and it computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
+ *
+ * @note The number of rows of destination matrix must be passed at compile time using -DM
+ * @note The number of columns of the destination matrix must be passed at compile time using -DN
+ * @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note The optional alpha's value needs to be passed at compile time using -DALPHA
+ * @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
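+ * @note Illustrative build configuration (example values only): -DM=64 -DN=64 -DK=128 -DH0=4 -DV0=2 -DPARTIAL_STORE_M0=0 -DPARTIAL_STORE_N0=0, adding -DALPHA only when a scaling factor other than 1 is required
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. 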
-DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. + * The activation function is performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported data type: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),
+                                                         IMAGE_DECLARATION(src1),
+#if defined(BETA)
+                                                         IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+                                                         IMAGE_DECLARATION(dst),
+                                                         uint src0_stride_z,
+                                                         uint src1_stride_z,
+#if defined(BETA)
+                                                         uint src2_stride_z,
+#endif //defined(BETA)
+                                                         uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                         ,
+                                                         uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                        )
+{
+    int x = get_global_id(0) / H0;
+    int y = get_global_id(1) / V0;
+    int z = get_global_id(2);
+
+    // Offset
+    const int offset_row_a = (get_global_id(1) % V0) * 4;
+    const int offset_row_b = (get_global_id(0) % H0) * 4;
+
+    // src_addr_a = address of matrix A
+    // src_addr_b = address of matrix B
+    int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+    int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+    src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
+    __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);
+
+    src_addr_a += offset_row_a;
+    src_addr_b += offset_row_b;
+
+    // Reset accumulators
+    float4 c0 = 0.0f;
+    float4 c1 = 0.0f;
+    float4 c2 = 0.0f;
+    float4 c3 = 0.0f;
+
+    int i = 0;
+    for(; i <= (int)(K - 4); i += 4)
+    {
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        float4 a0 = vload4(0, src_addr_a);
+        float4 b0 = vload4(0, src_addr_b);
+
+        src_addr_a += 4 * V0;
+        src_addr_b += 4 * H0;
+
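+        // 4x4 outer-product accumulation: each element of the interleaved A column scales the whole transposed B row via fma
+        c0.s0 = fma(a0.s0, b0.s0, c0.s0);
+        c0.s1 = fma(a0.s0, b0.s1, c0.s1);
+        c0.s2 = fma(a0.s0, b0.s2, c0.s2);
+        c0.s3 = fma(a0.s0, 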
b0.s3, c0.s3); + + c1.s0 = fma(a0.s1, b0.s0, c1.s0); + c1.s1 = fma(a0.s1, b0.s1, c1.s1); + c1.s2 = fma(a0.s1, b0.s2, c1.s2); + c1.s3 = fma(a0.s1, b0.s3, c1.s3); + + c2.s0 = fma(a0.s2, b0.s0, c2.s0); + c2.s1 = fma(a0.s2, b0.s1, c2.s1); + c2.s2 = fma(a0.s2, b0.s2, c2.s2); + c2.s3 = fma(a0.s2, b0.s3, c2.s3); + + c3.s0 = fma(a0.s3, b0.s0, c3.s0); + c3.s1 = fma(a0.s3, b0.s1, c3.s1); + c3.s2 = fma(a0.s3, b0.s2, c3.s2); + c3.s3 = fma(a0.s3, b0.s3, c3.s3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload4(0, src_addr_b); + + src_addr_a += 4 * V0; + src_addr_b += 4 * H0; + + c0.s0 = fma(a0.s0, b0.s0, c0.s0); + c0.s1 = fma(a0.s0, b0.s1, c0.s1); + c0.s2 = fma(a0.s0, b0.s2, c0.s2); + c0.s3 = fma(a0.s0, b0.s3, c0.s3); + + c1.s0 = fma(a0.s1, b0.s0, c1.s0); + c1.s1 = fma(a0.s1, b0.s1, c1.s1); + c1.s2 = fma(a0.s1, b0.s2, c1.s2); + c1.s3 = fma(a0.s1, b0.s3, c1.s3); + + c2.s0 = fma(a0.s2, b0.s0, c2.s0); + c2.s1 = fma(a0.s2, b0.s1, c2.s1); + c2.s2 = fma(a0.s2, b0.s2, c2.s2); + c2.s3 = fma(a0.s2, b0.s3, c2.s3); + + c3.s0 = fma(a0.s3, b0.s0, c3.s0); + c3.s1 = fma(a0.s3, b0.s1, c3.s1); + c3.s2 = fma(a0.s3, b0.s2, c3.s2); + c3.s3 = fma(a0.s3, b0.s3, c3.s3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload4(0, src_addr_b); + + src_addr_a += 4 * V0; + src_addr_b += 4 * H0; + + c0.s0 = fma(a0.s0, b0.s0, c0.s0); + c0.s1 = fma(a0.s0, b0.s1, c0.s1); + c0.s2 = fma(a0.s0, b0.s2, c0.s2); + c0.s3 = fma(a0.s0, b0.s3, c0.s3); + + c1.s0 = fma(a0.s1, b0.s0, c1.s0); + c1.s1 = fma(a0.s1, b0.s1, c1.s1); + c1.s2 = fma(a0.s1, b0.s2, c1.s2); + c1.s3 = fma(a0.s1, b0.s3, c1.s3); + + c2.s0 = fma(a0.s2, b0.s0, c2.s0); + c2.s1 = fma(a0.s2, b0.s1, c2.s1); + c2.s2 = fma(a0.s2, b0.s2, c2.s2); + c2.s3 = fma(a0.s2, b0.s3, c2.s3); + + c3.s0 = fma(a0.s3, b0.s0, c3.s0); + c3.s1 = fma(a0.s3, b0.s1, c3.s1); + c3.s2 = fma(a0.s3, b0.s2, c3.s2); + c3.s3 = fma(a0.s3, b0.s3, c3.s3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload4(0, src_addr_b); + + src_addr_a += 4 * V0; + src_addr_b += 4 * H0; + + c0.s0 = fma(a0.s0, b0.s0, c0.s0); + c0.s1 = fma(a0.s0, b0.s1, c0.s1); + c0.s2 = fma(a0.s0, b0.s2, c0.s2); + c0.s3 = fma(a0.s0, b0.s3, c0.s3); + + c1.s0 = fma(a0.s1, b0.s0, c1.s0); + c1.s1 = fma(a0.s1, b0.s1, c1.s1); + c1.s2 = fma(a0.s1, b0.s2, c1.s2); + c1.s3 = fma(a0.s1, b0.s3, c1.s3); + + c2.s0 = fma(a0.s2, b0.s0, c2.s0); + c2.s1 = fma(a0.s2, b0.s1, c2.s1); + c2.s2 = fma(a0.s2, b0.s2, c2.s2); + c2.s3 = fma(a0.s2, b0.s3, c2.s3); + + c3.s0 = fma(a0.s3, b0.s0, c3.s0); + c3.s1 = fma(a0.s3, b0.s1, c3.s1); + c3.s2 = fma(a0.s3, b0.s2, c3.s2); + c3.s3 = fma(a0.s3, b0.s3, c3.s3); + } + + for(; i < (int)K; ++i) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + float4 a0 = vload4(0, src_addr_a); + float4 b0 = vload4(0, src_addr_b); + + src_addr_a += 4 * V0; + src_addr_b += 4 * H0; + + c0.s0 = fma(a0.s0, b0.s0, c0.s0); + c0.s1 = fma(a0.s0, b0.s1, c0.s1); + c0.s2 = fma(a0.s0, b0.s2, c0.s2); + c0.s3 = fma(a0.s0, b0.s3, c0.s3); + + c1.s0 = fma(a0.s1, b0.s0, c1.s0); + c1.s1 = fma(a0.s1, b0.s1, c1.s1); + c1.s2 = fma(a0.s1, b0.s2, c1.s2); + c1.s3 = fma(a0.s1, b0.s3, c1.s3); + + c2.s0 = fma(a0.s2, b0.s0, c2.s0); + c2.s1 = fma(a0.s2, b0.s1, c2.s1); + c2.s2 = fma(a0.s2, b0.s2, c2.s2); + c2.s3 = fma(a0.s2, b0.s3, c2.s3); + + c3.s0 = fma(a0.s3, b0.s0, c3.s0); + c3.s1 = fma(a0.s3, b0.s1, c3.s1); + c3.s2 = fma(a0.s3, b0.s2, c3.s2); + c3.s3 = fma(a0.s3, 
b0.s3, c3.s3);
+    }
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+    uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+    zout = min(DEPTH_GEMM3D - 1, zout);
+
+    // Add offset due to the cross plane paddings
+    zout *= (cross_plane_pad * dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(4, float, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+    REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
+
+    LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, float, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(4, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
+                                    2) * src2_stride_z;
+
+    LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(4, float, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias
+    ADD_BLOCK(4, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, VEC_SIZE, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store 4x4 block
+    const bool cond_y = ((get_global_id(1) + 1) * 4 >= M);
+    const bool cond_x = ((get_global_id(0) + 1) * 4 >= N);
+    STORE_BLOCK_BOUNDARY_AWARE(4, 4, float, c, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+}
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
+ *
+ * @note The number of rows of destination matrix must be passed at compile time using -DM
+ * @note The number of columns of the destination matrix must be passed at compile time using -DN
+ * @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK
+ * @note The number of columns of the reshaped rhs matrix must be passed at compile time using -DIN1_DIM_X
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. 
-DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note The optional alpha's value needs to be passed at compile time using -DALPHA
+ * @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported data type: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),
+                                                 IMAGE_DECLARATION(src1),
+#if defined(BETA)
+                                                 IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+                                                 IMAGE_DECLARATION(dst),
+                                                 uint src0_stride_z,
+                                                 uint src1_stride_z,
+#if defined(BETA)
+                                                 uint src2_stride_z,
+#endif //defined(BETA)
+                                                 uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                 ,
+                                                 uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                )
+{
+    int x = get_global_id(0) / H0;
+    int y = get_global_id(1) / V0;
+    int z = get_global_id(2);
+
+    // Offset
+    const int offset_row_a = (get_global_id(1) % V0) * 4;
+    const int offset_row_b = (get_global_id(0) % H0) * 8;
+
+    // src_addr_a = address of matrix A
+    // src_addr_b = address of matrix B
+    int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+    int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+    src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
+    __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
+
+    // Compute end row address for matrix B
+    __global half *src_end_addr_b = src_addr_b + IN1_DIM_X;
+
+    src_addr_a += offset_row_a;
+    src_addr_b += offset_row_b;
+
+    // Reset accumulators
+    half8 c0 = 0.0f;
+    half8 c1 = 0.0f;
+    half8 c2 = 0.0f;
+    half8 c3 = 0.0f;
+
+    for(; src_addr_b <= (src_end_addr_b - (int)(16 * H0)); src_addr_a += 8 * V0, src_addr_b += 16 * H0)
+    {
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        half4 a0 = vload4(0, src_addr_a);
+        half8 b0 = vload8(0, src_addr_b);
+
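+        // Rank-1 update of the 4x8 tile: each interleaved A value scales the full 8-wide transposed B row
+        c0 += (half8)a0.s0 * b0;
+        c1 += 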
(half8)a0.s1 * b0;
+        c2 += (half8)a0.s2 * b0;
+        c3 += (half8)a0.s3 * b0;
+
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        a0 = vload4(0, src_addr_a + 4 * V0);
+        b0 = vload8(0, src_addr_b + 8 * H0);
+
+        c0 += (half8)a0.s0 * b0;
+        c1 += (half8)a0.s1 * b0;
+        c2 += (half8)a0.s2 * b0;
+        c3 += (half8)a0.s3 * b0;
+    }
+
+    for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * V0, src_addr_b += 8 * H0)
+    {
+        // Load values from matrix A (interleaved) and matrix B (transposed)
+        half4 a0 = vload4(0, src_addr_a);
+        half8 b0 = vload8(0, src_addr_b);
+
+        c0 += (half8)a0.s0 * b0;
+        c1 += (half8)a0.s1 * b0;
+        c2 += (half8)a0.s2 * b0;
+        c3 += (half8)a0.s3 * b0;
+    }
+
+    // Compute destination address
+    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+    // Compute dst address
+    __global uchar *dst_addr = offset(&dst, 0, 0);
+
+    uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+    zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+    zout = min(DEPTH_GEMM3D - 1, zout);
+
+    // Add offset due to the cross plane paddings
+    zout *= (cross_plane_pad * dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(4, half, c, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+    REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
+
+    LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, half, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(4, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
+                                    2) * src2_stride_z;
+
+    LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(4, half, bias, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias
+    ADD_BLOCK(4, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, VEC_SIZE, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store 4x8 block
+    const bool cond_y = ((get_global_id(1) + 1) * 4 >= M);
+    const bool cond_x = ((get_global_id(0) + 1) * 8 >= N);
+    STORE_BLOCK_BOUNDARY_AWARE(4, 8, half, c, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+}
+
+/** This OpenCL kernel computes the matrix multiplication between matrix A 
reshaped (src0) and matrix B reshaped (src1) while accumulating the result in a 32-bit floating-point variable.
+ *
+ * @note The number of rows of destination matrix must be passed at compile time using -DM
+ * @note The number of columns of the destination matrix must be passed at compile time using -DN
+ * @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK
+ * @note The number of columns of the reshaped rhs matrix must be passed at compile time using -DIN1_DIM_X
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note The optional alpha's value needs to be passed at compile time using -DALPHA
+ * @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src1_ptr Pointer to the source matrix. 
Supported data types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),
+                                                       IMAGE_DECLARATION(src1),
+#if defined(BETA)
+                                                       IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+                                                       IMAGE_DECLARATION(dst),
+                                                       uint src0_stride_z,
+                                                       uint src1_stride_z,
+#if defined(BETA)
+                                                       uint src2_stride_z,
+#endif //defined(BETA)
+                                                       uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                       ,
+                                                       uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                      )
+{
+    int x = get_global_id(0) / H0;
+    int y = get_global_id(1) / V0;
+    int z = get_global_id(2);
+
+    // Offset
+    const int offset_row_a = (get_global_id(1) % V0) * 4;
+    const int offset_row_b = (get_global_id(0) % H0) * 8;
+
+    // src_addr_a = address of matrix A
+    // src_addr_b = address of matrix B
+    int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+    int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+    src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
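+    // Convert the byte offsets to half pointers; offset_row_a / offset_row_b below select this work-item's sub-block within the interleaved and transposed rows
+    __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
+    __global half *src_addr_b = (__global half *)(src1_ptr + 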
src1_addr_in_bytes); + + // Compute end row address for matrix B + __global half *src_end_addr_b = src_addr_b + IN1_DIM_X; + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; + + // Reset accumulators + float8 c0 = 0.0f; + float8 c1 = 0.0f; + float8 c2 = 0.0f; + float8 c3 = 0.0f; + + for(; src_addr_b <= (src_end_addr_b - (int)(16 * H0)); src_addr_a += 8 * V0, src_addr_b += 16 * H0) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + float4 a0 = convert_float4(vload4(0, src_addr_a)); + float8 b0 = convert_float8(vload8(0, src_addr_b)); + + c0 += (float8)a0.s0 * b0; + c1 += (float8)a0.s1 * b0; + c2 += (float8)a0.s2 * b0; + c3 += (float8)a0.s3 * b0; + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = convert_float4(vload4(0, src_addr_a + 4 * V0)); + b0 = convert_float8(vload8(0, src_addr_b + 8 * H0)); + + c0 += (float8)a0.s0 * b0; + c1 += (float8)a0.s1 * b0; + c2 += (float8)a0.s2 * b0; + c3 += (float8)a0.s3 * b0; + } + + for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * V0, src_addr_b += 8 * H0) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + float4 a0 = convert_float4(vload4(0, src_addr_a)); + float8 b0 = convert_float8(vload8(0, src_addr_b)); + + c0 += (float8)a0.s0 * b0; + c1 += (float8)a0.s1 * b0; + c2 += (float8)a0.s2 * b0; + c3 += (float8)a0.s3 * b0; + } + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension + // in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(4, float, c, ALPHA);
+#endif // defined(ALPHA)
+
+#if defined(BETA)
+    REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
+
+    LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+    float8 bias_f0 = convert_float8(bias0);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, float, bias_f, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(4, c, bias_f0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(
+                                    2) * src2_stride_z;
+
+    LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+    float8 bias_f0 = convert_float8(bias0);
+    float8 bias_f1 = convert_float8(bias1);
+    float8 bias_f2 = convert_float8(bias2);
+    float8 bias_f3 = convert_float8(bias3);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(4, float, bias_f, BETA);
+#endif // UNIT_BETA
+
+    // c = c + bias
+    ADD_BLOCK(4, c, bias_f);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+    half8 c_h0 = convert_half8(c0);
+    half8 c_h1 = convert_half8(c1);
+    half8 c_h2 = convert_half8(c2);
+    half8 c_h3 = convert_half8(c3);
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, VEC_SIZE, c_h, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store 4x8 block
+    const bool cond_y = ((get_global_id(1) + 1) * 4 >= M);
+    const bool cond_x = ((get_global_id(0) + 1) * 8 >= N);
+    STORE_BLOCK_BOUNDARY_AWARE(4, 8, half, c_h, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+}
+
+/** This OpenCL kernel, optimized for Bifrost architectures, computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
+ *
+ * @note The number of rows of destination matrix must be passed at compile time using -DM
+ * @note The number of columns of the destination matrix must be passed at compile time using -DN
+ * @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note The optional alpha's value needs to be passed at compile time using -DALPHA
+ * @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2)
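+ * @note Illustrative configuration (example values only): building with -DV0=1 enables the V0 == 1 path in the inner loop below, which reuses a single 8-element load from the interleaved A matrix across two k-steps
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. 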
-DMATRIX_B_DEPTH=16) + * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. + * The activation function is performed after the bias addition + * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time: + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported data type: same as @p src0_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem (in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem (in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix + * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), + uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D + ) +{ + int x = get_global_id(0) / H0; + int y = get_global_id(1) / V0; + int z = get_global_id(2); + + // Offset + const int offset_row_a = (get_global_id(1) % V0) * 4; + const int offset_row_b = (get_global_id(0) % H0) * 8; + + // src_addr_a = address of matrix A + // src_addr_b = address of matrix B + int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; + int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src1_addr_in_bytes += z * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes); + __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes); + + src_addr_a += offset_row_a; + src_addr_b += offset_row_b; + + // Reset accumulators + half8 c0 = 0.0f; + half8 c1 = 0.0f; + half8 c2 = 0.0f; + half8 c3 = 0.0f; + + int i = 0; + for(; i <= (int)(K - 4); i += 4) + { +#if V0 == 1 + // Load values from matrix A (interleaved) and matrix B (transposed) + half8 a0 = vload8(0, src_addr_a); + half8 b0 = vload8(0, src_addr_b); + + src_addr_a += 8 * V0; + src_addr_b += 8 * H0; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + + // Load values from matrix B (transposed) + b0 = vload8(0, src_addr_b); + +
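// Each fma below broadcasts one half-precision element of the interleaved A block across the + // 8 columns held in b0, i.e. each call is one rank-1 update of one row of the 4x8 + // accumulator tile (c0..c3); src_addr_b advances by 8 * H0 because the transposed B blocks of H0 tiles are interleaved in memory. +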
src_addr_b += 8 * H0; + + c0 = fma((half8)a0.s4, b0, c0); + c1 = fma((half8)a0.s5, b0, c1); + c2 = fma((half8)a0.s6, b0, c2); + c3 = fma((half8)a0.s7, b0, c3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload8(0, src_addr_a); + b0 = vload8(0, src_addr_b); + + src_addr_a += 8 * V0; + src_addr_b += 8 * H0; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + + // Load values from matrix B (transposed) + b0 = vload8(0, src_addr_b); + + src_addr_b += 8 * H0; + + c0 = fma((half8)a0.s4, b0, c0); + c1 = fma((half8)a0.s5, b0, c1); + c2 = fma((half8)a0.s6, b0, c2); + c3 = fma((half8)a0.s7, b0, c3); +#else // V0 == 1 + // Load values from matrix A (interleaved) and matrix B (transposed) + half4 a0 = vload4(0, src_addr_a); + half8 b0 = vload8(0, src_addr_b); + + src_addr_a += 4 * V0; + src_addr_b += 8 * H0; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload8(0, src_addr_b); + + src_addr_a += 4 * V0; + src_addr_b += 8 * H0; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload8(0, src_addr_b); + + src_addr_a += 4 * V0; + src_addr_b += 8 * H0; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + + // Load values from matrix A (interleaved) and matrix B (transposed) + a0 = vload4(0, src_addr_a); + b0 = vload8(0, src_addr_b); + + src_addr_a += 4 * V0; + src_addr_b += 8 * H0; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); +#endif // V0 == 1 + } + + for(; i < (int)K; ++i) + { + // Load values from matrix A (interleaved) and matrix B (transposed) + half4 a0 = vload4(0, src_addr_a); + half8 b0 = vload8(0, src_addr_b); + + src_addr_a += 4 * V0; + src_addr_b += 8 * H0; + + c0 = fma((half8)a0.s0, b0, c0); + c1 = fma((half8)a0.s1, b0, c1); + c2 = fma((half8)a0.s2, b0, c2); + c3 = fma((half8)a0.s3, b0, c3); + } + + // Compute destination address + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + // Compute dst address + __global uchar *dst_addr = offset(&dst, 0, 0); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension + // in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(4, half, c, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)); + + LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, half, bias, BETA); +#endif // UNIT_BETA + + // c = c + bias[broadcasted] + ADD_BLOCK_BROADCAST(4, c, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id( + 2) * src2_stride_z; + + LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(4, half, bias, BETA); +#endif // UNIT_BETA + + // c = c + bias + ADD_BLOCK(4, c, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, VEC_SIZE, c, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store 4x8 block + const bool cond_y = ((get_global_id(1) + 1) * 4 >= M); + const bool cond_x = ((get_global_id(0) + 1) * 8 >= N); + STORE_BLOCK_BOUNDARY_AWARE(4, 8, half, c, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); +} + +#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) + +#endif // defined(M) && defined(N) && defined(K) && defined(H0) && defined(V0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) && defined(IN1_DIM_X) + +#if defined(N) && defined(K) && defined(M0) && defined(N0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) +#if defined(DATA_TYPE) +#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, N0) +/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped. + * + * @note This OpenCL kernel works with floating point data types (F16/F32) + * @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) + * @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0 + * @note The number of columns of matrix A and the number of columns of the matrix B need to be passed at compile time using -DK and -DN + * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) + * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) + * @note The optional alpha value needs to be passed at compile time using -DALPHA + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g.
-DMATRIX_B_DEPTH=16) + * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also be passed at compile time using -DA_VAL= and -DB_VAL= respectively. + * The activation function is performed after the bias addition + * @note In case the input or output has to be reinterpreted as a 3D tensor, the following information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem (in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem (in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem (in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem (in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix.
Supported data type: same as @p src0_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem (in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem (in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix + * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), + uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint src_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D + ) +{ + int idx = get_global_id(0) * N0; + + // Compute starting address for matrix A and matrix B + int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + + // Update address for the matrix A + src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y; + + // Update address for the matrix B + src_addr.s1 += idx * sizeof(DATA_TYPE); + +#if defined(REINTERPRET_INPUT_AS_3D) + // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension + // in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zin) is calculated dividing row by HEIGHT_GEMM3D + uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D; + zin = min(DEPTH_GEMM3D - 1, zin); + + // Add offset due to the cross plane paddings + zin *= (src_cross_plane_pad * src0_stride_y); + + // Add offset for batched GEMM.
The batches will be in the fourth dimension and for this reason we + // multiply src0_stride_z by DEPTH_GEMM3D + src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + src_addr.s0 += get_global_id(2) * src0_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + int end_row_vec_a = src_addr.s0 + (K * sizeof(DATA_TYPE)); + + VECTOR_TYPE acc0 = 0.0f; +#if M0 > 1 + VECTOR_TYPE acc1 = 0.0f; +#endif // M0 > 1 +#if M0 > 2 + VECTOR_TYPE acc2 = 0.0f; +#endif // M0 > 2 +#if M0 > 3 + VECTOR_TYPE acc3 = 0.0f; +#endif // M0 > 3 + + for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y)) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + LOAD_BLOCK(M0, 2, DATA_TYPE, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s); +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + VEC_DATA_TYPE(DATA_TYPE, 2) + a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if M0 > 1 + VEC_DATA_TYPE(DATA_TYPE, 2) + a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // M0 > 1 +#if M0 > 2 + VEC_DATA_TYPE(DATA_TYPE, 2) + a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // M0 > 2 +#if M0 > 3 + VEC_DATA_TYPE(DATA_TYPE, 2) + a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // M0 > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + VECTOR_TYPE b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1)); + VECTOR_TYPE b1 = VLOAD(N0)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y)); + + // Accumulate + acc0 += b0 * (VECTOR_TYPE)a0.s0; + acc0 += b1 * (VECTOR_TYPE)a0.s1; +#if M0 > 1 + acc1 += b0 * (VECTOR_TYPE)a1.s0; + acc1 += b1 * (VECTOR_TYPE)a1.s1; +#endif // M0 > 1 +#if M0 > 2 + acc2 += b0 * (VECTOR_TYPE)a2.s0; + acc2 += b1 * (VECTOR_TYPE)a2.s1; +#endif // M0 > 2 +#if M0 > 3 + acc3 += b0 * (VECTOR_TYPE)a3.s0; + acc3 += b1 * (VECTOR_TYPE)a3.s1; +#endif // M0 > 3 + } + + for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y)) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); +#if M0 > 1 + DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#endif // M0 > 1 +#if M0 > 2 + DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); +#endif // M0 > 2 +#if M0 > 3 + DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); +#endif // M0 > 3 +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if M0 > 1 + DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // M0 > 1 +#if M0 > 2 + DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // M0 > 2 +#if M0 > 3 + DATA_TYPE a3 = 
*((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // M0 > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + VECTOR_TYPE b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1)); + + // Accumulate + acc0 += b0 * (VECTOR_TYPE)a0; +#if M0 > 1 + acc1 += b0 * (VECTOR_TYPE)a1; +#endif // M0 > 1 +#if M0 > 2 + acc2 += b0 * (VECTOR_TYPE)a2; +#endif // M0 > 2 +#if M0 > 3 + acc3 += b0 * (VECTOR_TYPE)a3; +#endif // M0 > 3 + } + + int z = get_global_id(2); + + // Compute dst address + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, + PARTIAL_STORE_M0) + * dst_stride_y); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension + // in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing row by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (dst_cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, DATA_TYPE, acc, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); + + LOAD_BLOCK(1, N0, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, DATA_TYPE, bias, BETA); +#endif // UNIT_BETA + + // acc = acc + bias[broadcasted] + ADD_BLOCK_BROADCAST(M0, acc, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, + PARTIAL_STORE_M0) + * src2_stride_y) + + z * src2_stride_z; + + LOAD_BLOCK(M0, N0, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); +#endif // UNIT_BETA + + // acc = acc + bias + ADD_BLOCK(M0, acc, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store output block + const bool cond_y = get_global_id(1) == 0; + const bool cond_x = ((get_global_id(0) + 1) * N0 >= N); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, acc, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); +} +#endif // defined(DATA_TYPE) + +/** This OpenCL kernel
computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped + * + * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units. + * @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0. + * @note This kernel processes a fixed number of elements along x: -DN0=4. + * @note The number of columns of matrix A and the number of columns of the matrix B need to be passed at compile time using -DK and -DN + * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) + * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) + * @note The optional alpha value needs to be passed at compile time using -DALPHA + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) + * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also be passed at compile time using -DA_VAL= and -DB_VAL= respectively. + * The activation function is performed after the bias addition + * @note In case the input or output has to be reinterpreted as a 3D tensor, the following information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem (in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem (in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem (in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem (in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix.
Supported data type: same as @p src0_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem (in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem (in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix + * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), + uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint src_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D + ) +{ + int idx = get_global_id(0) * N0; + + // Compute starting address for matrix A and matrix B + int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + + // Update address for matrix A + src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y; + + // Update address for matrix B + src_addr.s1 += idx * sizeof(float); + +#if defined(REINTERPRET_INPUT_AS_3D) + // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension + // in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zin) is calculated dividing row by HEIGHT_GEMM3D + uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D; + zin = min(DEPTH_GEMM3D - 1, zin); + + // Add offset due to the cross plane paddings + zin *= (src_cross_plane_pad * src0_stride_y); + + // Add offset for batched GEMM.
The batches will be in the fourth dimension and for this reason we + // multiply src0_stride_z by DEPTH_GEMM3D + src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + src_addr.s0 += get_global_id(2) * src0_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + // Initialize accumulators + float4 acc0 = 0.0f; + +#if M0 > 1 + float4 acc1 = 0.0f; +#endif // M0 > 1 + +#if M0 > 2 + float4 acc2 = 0.0f; +#endif // M0 > 2 + +#if M0 > 3 + float4 acc3 = 0.0f; +#endif // M0 > 3 + + // A and B src indices get incremented at the same time. + int i = 0; + for(; i <= ((int)K - 4); i += 4) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A and matrix B + LOAD_BLOCK(M0, 4, float, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s); +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A and matrix B + float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if M0 > 1 + float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // M0 > 1 +#if M0 > 2 + float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // M0 > 2 +#if M0 > 3 + float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // M0 > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0.s0, b0.s0, acc0.s0); + acc0.s1 = fma(a0.s0, b0.s1, acc0.s1); + acc0.s2 = fma(a0.s0, b0.s2, acc0.s2); + acc0.s3 = fma(a0.s0, b0.s3, acc0.s3); + +#if M0 > 1 + + acc1.s0 = fma(a1.s0, b0.s0, acc1.s0); + acc1.s1 = fma(a1.s0, b0.s1, acc1.s1); + acc1.s2 = fma(a1.s0, b0.s2, acc1.s2); + acc1.s3 = fma(a1.s0, b0.s3, acc1.s3); + +#endif // M0 > 1 +#if M0 > 2 + + acc2.s0 = fma(a2.s0, b0.s0, acc2.s0); + acc2.s1 = fma(a2.s0, b0.s1, acc2.s1); + acc2.s2 = fma(a2.s0, b0.s2, acc2.s2); + acc2.s3 = fma(a2.s0, b0.s3, acc2.s3); + +#endif // M0 > 2 +#if M0 > 3 + + acc3.s0 = fma(a3.s0, b0.s0, acc3.s0); + acc3.s1 = fma(a3.s0, b0.s1, acc3.s1); + acc3.s2 = fma(a3.s0, b0.s2, acc3.s2); + acc3.s3 = fma(a3.s0, b0.s3, acc3.s3); +#endif // M0 > 3 + + // Load values from matrix A and matrix B + b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0.s1, b0.s0, acc0.s0); + acc0.s1 = fma(a0.s1, b0.s1, acc0.s1); + acc0.s2 = fma(a0.s1, b0.s2, acc0.s2); + acc0.s3 = fma(a0.s1, b0.s3, acc0.s3); + +#if M0 > 1 + + acc1.s0 = fma(a1.s1, b0.s0, acc1.s0); + acc1.s1 = fma(a1.s1, b0.s1, acc1.s1); + acc1.s2 = fma(a1.s1, b0.s2, acc1.s2); + acc1.s3 = fma(a1.s1, b0.s3, acc1.s3); + +#endif // M0 > 1 +#if M0 > 2 + + acc2.s0 = fma(a2.s1, b0.s0, acc2.s0); + acc2.s1 = fma(a2.s1, b0.s1, acc2.s1); + acc2.s2 = fma(a2.s1, b0.s2, acc2.s2); + acc2.s3 = fma(a2.s1, b0.s3, acc2.s3); + +#endif // M0 > 2 +#if M0 > 3 + + acc3.s0 = fma(a3.s1, b0.s0, acc3.s0); + acc3.s1 = fma(a3.s1, b0.s1, acc3.s1); + acc3.s2 = fma(a3.s1, b0.s2, acc3.s2); + acc3.s3 = fma(a3.s1, b0.s3, acc3.s3); +#endif // M0 > 3 + + // Load values from matrix A and 
matrix B + b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0.s2, b0.s0, acc0.s0); + acc0.s1 = fma(a0.s2, b0.s1, acc0.s1); + acc0.s2 = fma(a0.s2, b0.s2, acc0.s2); + acc0.s3 = fma(a0.s2, b0.s3, acc0.s3); + +#if M0 > 1 + + acc1.s0 = fma(a1.s2, b0.s0, acc1.s0); + acc1.s1 = fma(a1.s2, b0.s1, acc1.s1); + acc1.s2 = fma(a1.s2, b0.s2, acc1.s2); + acc1.s3 = fma(a1.s2, b0.s3, acc1.s3); + +#endif // M0 > 1 +#if M0 > 2 + + acc2.s0 = fma(a2.s2, b0.s0, acc2.s0); + acc2.s1 = fma(a2.s2, b0.s1, acc2.s1); + acc2.s2 = fma(a2.s2, b0.s2, acc2.s2); + acc2.s3 = fma(a2.s2, b0.s3, acc2.s3); + +#endif // M0 > 2 +#if M0 > 3 + + acc3.s0 = fma(a3.s2, b0.s0, acc3.s0); + acc3.s1 = fma(a3.s2, b0.s1, acc3.s1); + acc3.s2 = fma(a3.s2, b0.s2, acc3.s2); + acc3.s3 = fma(a3.s2, b0.s3, acc3.s3); +#endif // M0 > 3 + + // Load values from matrix A and matrix B + b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0.s3, b0.s0, acc0.s0); + acc0.s1 = fma(a0.s3, b0.s1, acc0.s1); + acc0.s2 = fma(a0.s3, b0.s2, acc0.s2); + acc0.s3 = fma(a0.s3, b0.s3, acc0.s3); + +#if M0 > 1 + + acc1.s0 = fma(a1.s3, b0.s0, acc1.s0); + acc1.s1 = fma(a1.s3, b0.s1, acc1.s1); + acc1.s2 = fma(a1.s3, b0.s2, acc1.s2); + acc1.s3 = fma(a1.s3, b0.s3, acc1.s3); + +#endif // M0 > 1 +#if M0 > 2 + + acc2.s0 = fma(a2.s3, b0.s0, acc2.s0); + acc2.s1 = fma(a2.s3, b0.s1, acc2.s1); + acc2.s2 = fma(a2.s3, b0.s2, acc2.s2); + acc2.s3 = fma(a2.s3, b0.s3, acc2.s3); + +#endif // M0 > 2 +#if M0 > 3 + + acc3.s0 = fma(a3.s3, b0.s0, acc3.s0); + acc3.s1 = fma(a3.s3, b0.s1, acc3.s1); + acc3.s2 = fma(a3.s3, b0.s2, acc3.s2); + acc3.s3 = fma(a3.s3, b0.s3, acc3.s3); +#endif // M0 > 3 + + src_addr.s0 += 4 * sizeof(float); + } + + for(; i < (int)K; ++i) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); +#if M0 > 1 + float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#endif // M0 > 1 +#if M0 > 2 + float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); +#endif // M0 > 2 +#if M0 > 3 + float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); +#endif // M0 > 3 +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if M0 > 1 + float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // M0 > 1 +#if M0 > 2 + float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // M0 > 2 +#if M0 > 3 + float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // M0 > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0, b0.s0, acc0.s0); + acc0.s1 = fma(a0, b0.s1, acc0.s1); + acc0.s2 = fma(a0, b0.s2, acc0.s2); + acc0.s3 = fma(a0, b0.s3, acc0.s3); +#if M0 > 1 + acc1.s0 = fma(a1, b0.s0, acc1.s0); + acc1.s1 = fma(a1, b0.s1, acc1.s1); + acc1.s2 = fma(a1, b0.s2, acc1.s2); + acc1.s3 = fma(a1, b0.s3, acc1.s3); +#endif // M0 > 1 +#if M0 > 2 + acc2.s0 = fma(a2, b0.s0, acc2.s0); + acc2.s1 = fma(a2, b0.s1, acc2.s1); + acc2.s2 = fma(a2, b0.s2, acc2.s2); + acc2.s3 = fma(a2, b0.s3, 
acc2.s3); +#endif // M0 > 2 +#if M0 > 3 + acc3.s0 = fma(a3, b0.s0, acc3.s0); + acc3.s1 = fma(a3, b0.s1, acc3.s1); + acc3.s2 = fma(a3, b0.s2, acc3.s2); + acc3.s3 = fma(a3, b0.s3, acc3.s3); +#endif // M0 > 3 + + src_addr.s0 += sizeof(float); + } + + int z = get_global_id(2); + + // Compute dst address + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, + PARTIAL_STORE_M0) + * dst_stride_y); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension + // in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing row by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (dst_cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, float, acc, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)); + + LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, float, bias, BETA); +#endif // UNIT_BETA + + // acc = acc + bias[broadcasted] + ADD_BLOCK_BROADCAST(M0, acc, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, + PARTIAL_STORE_M0) + * src2_stride_y) + + z * src2_stride_z; + + LOAD_BLOCK(M0, 4, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, float, bias, BETA); +#endif // UNIT_BETA + + // acc = acc + bias + ADD_BLOCK(M0, acc, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, float, VEC_SIZE, acc, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store the output block + const bool cond_y = get_global_id(1) == 0; + const bool cond_x = ((get_global_id(0) + 1) * 4 >= N); + STORE_BLOCK_BOUNDARY_AWARE(M0, 4, float, acc, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); +} + +/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped + * + * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
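+ * Note that fma(a, b, c) computes a * b + c as a single fused multiply-add with one rounding step, which is what the unrolled multiply-accumulate chains in this kernel map onto.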
+ * This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less than or equal to 1000. + * @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0. + * @note This kernel processes a fixed number of elements along x: -DN0=2. + * @note The number of columns of matrix A and the number of columns of the matrix B need to be passed at compile time using -DK and -DN + * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) + * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) + * @note The optional alpha value needs to be passed at compile time using -DALPHA + * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) + * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) + * + * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also be passed at compile time using -DA_VAL= and -DB_VAL= respectively. + * The activation function is performed after the bias addition + * @note In case the input or output has to be reinterpreted as a 3D tensor, the following information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped + * + * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32 + * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem (in bytes) + * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem (in bytes) + * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr + * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) + * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem (in bytes) + * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) + * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem (in bytes) + * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix + * @param[in] src2_ptr (Optional) Pointer to the bias matrix.
Supported data type: same as @p src0_ptr + * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) + * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem (in bytes) + * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) + * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem (in bytes) + * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix + * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) + * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) + * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0), + IMAGE_DECLARATION(src1), +#if defined(BETA) + IMAGE_DECLARATION(src2), +#endif // defined(BETA) + IMAGE_DECLARATION(dst), + uint src0_stride_z, + uint src1_stride_z, +#if defined(BETA) + uint src2_stride_z, +#endif // defined(BETA) + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint src_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D + ) +{ + // Requires N0 = 2: C held as vect2, A loaded as vect4, B loaded as 2 vload2 // to fix for M0 > 1 + int idx = get_global_id(0) * N0; + + // Compute starting address for matrix A and matrix B + int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); + + // Update address for the matrix A + src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y; + + // Update address for the matrix B + src_addr.s1 += idx * sizeof(float); + +#if defined(REINTERPRET_INPUT_AS_3D) + // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension + // in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zin) is calculated dividing row by HEIGHT_GEMM3D + uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D; + zin = min(DEPTH_GEMM3D - 1, zin); + + // Add offset due to the cross plane paddings + zin *= (src_cross_plane_pad *
src0_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply src0_stride_z by DEPTH_GEMM3D + src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + src_addr.s0 += get_global_id(2) * src0_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + // Initialize accumulators + float2 acc0 = 0.0f; +#if M0 > 1 + float2 acc1 = 0.0f; +#endif // M0 > 1 +#if M0 > 2 + float2 acc2 = 0.0f; +#endif // M0 > 2 +#if M0 > 3 + float2 acc3 = 0.0f; +#endif // M0 > 3 + + // A and B src indices get incremented at the same time. + int i = 0; + for(; i <= ((int)K - 8); i += 8) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + zin.s0)); +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0)); +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + acc0.s0 = fma(a0.s0, b0.s0, acc0.s0); + acc0.s0 = fma(a0.s1, b1.s0, acc0.s0); + acc0.s0 = fma(a0.s2, b2.s0, acc0.s0); + acc0.s0 = fma(a0.s3, b3.s0, acc0.s0); + acc0.s0 = fma(a0.s4, b4.s0, acc0.s0); + acc0.s0 = fma(a0.s5, b5.s0, acc0.s0); + acc0.s0 = fma(a0.s6, b6.s0, acc0.s0); + acc0.s0 = fma(a0.s7, b7.s0, acc0.s0); + + acc0.s1 = fma(a0.s0, b0.s1, acc0.s1); + acc0.s1 = fma(a0.s1, b1.s1, acc0.s1); + acc0.s1 = fma(a0.s2, b2.s1, acc0.s1); + acc0.s1 = fma(a0.s3, b3.s1, acc0.s1); + acc0.s1 = fma(a0.s4, b4.s1, acc0.s1); + acc0.s1 = fma(a0.s5, b5.s1, acc0.s1); + acc0.s1 = fma(a0.s6, b6.s1, acc0.s1); + acc0.s1 = fma(a0.s7, b7.s1, acc0.s1); + +#if M0 > 1 +#if defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#else // defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // defined(REINTERPRET_INPUT_AS_3D) + acc1.s0 = fma(a0.s0, b0.s0, acc1.s0); + acc1.s0 = fma(a0.s1, b1.s0, acc1.s0); + acc1.s0 = fma(a0.s2, b2.s0, acc1.s0); + acc1.s0 = fma(a0.s3, b3.s0, acc1.s0); + acc1.s0 = fma(a0.s4, b4.s0, acc1.s0); + acc1.s0 = fma(a0.s5, b5.s0, acc1.s0); + acc1.s0 = fma(a0.s6, b6.s0, acc1.s0); + acc1.s0 = fma(a0.s7, b7.s0, acc1.s0); + + acc1.s1 = fma(a0.s0, b0.s1, 
acc1.s1); + acc1.s1 = fma(a0.s1, b1.s1, acc1.s1); + acc1.s1 = fma(a0.s2, b2.s1, acc1.s1); + acc1.s1 = fma(a0.s3, b3.s1, acc1.s1); + acc1.s1 = fma(a0.s4, b4.s1, acc1.s1); + acc1.s1 = fma(a0.s5, b5.s1, acc1.s1); + acc1.s1 = fma(a0.s6, b6.s1, acc1.s1); + acc1.s1 = fma(a0.s7, b7.s1, acc1.s1); +#endif // M0 > 1 +#if M0 > 2 +#if defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); +#else // defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // defined(REINTERPRET_INPUT_AS_3D) + acc2.s0 = fma(a0.s0, b0.s0, acc2.s0); + acc2.s0 = fma(a0.s1, b1.s0, acc2.s0); + acc2.s0 = fma(a0.s2, b2.s0, acc2.s0); + acc2.s0 = fma(a0.s3, b3.s0, acc2.s0); + acc2.s0 = fma(a0.s4, b4.s0, acc2.s0); + acc2.s0 = fma(a0.s5, b5.s0, acc2.s0); + acc2.s0 = fma(a0.s6, b6.s0, acc2.s0); + acc2.s0 = fma(a0.s7, b7.s0, acc2.s0); + + acc2.s1 = fma(a0.s0, b0.s1, acc2.s1); + acc2.s1 = fma(a0.s1, b1.s1, acc2.s1); + acc2.s1 = fma(a0.s2, b2.s1, acc2.s1); + acc2.s1 = fma(a0.s3, b3.s1, acc2.s1); + acc2.s1 = fma(a0.s4, b4.s1, acc2.s1); + acc2.s1 = fma(a0.s5, b5.s1, acc2.s1); + acc2.s1 = fma(a0.s6, b6.s1, acc2.s1); + acc2.s1 = fma(a0.s7, b7.s1, acc2.s1); +#endif // M0 > 2 +#if M0 > 3 +#if defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); +#else // defined(REINTERPRET_INPUT_AS_3D) + a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // defined(REINTERPRET_INPUT_AS_3D) + acc3.s0 = fma(a0.s0, b0.s0, acc3.s0); + acc3.s0 = fma(a0.s1, b1.s0, acc3.s0); + acc3.s0 = fma(a0.s2, b2.s0, acc3.s0); + acc3.s0 = fma(a0.s3, b3.s0, acc3.s0); + acc3.s0 = fma(a0.s4, b4.s0, acc3.s0); + acc3.s0 = fma(a0.s5, b5.s0, acc3.s0); + acc3.s0 = fma(a0.s6, b6.s0, acc3.s0); + acc3.s0 = fma(a0.s7, b7.s0, acc3.s0); + + acc3.s1 = fma(a0.s0, b0.s1, acc3.s1); + acc3.s1 = fma(a0.s1, b1.s1, acc3.s1); + acc3.s1 = fma(a0.s2, b2.s1, acc3.s1); + acc3.s1 = fma(a0.s3, b3.s1, acc3.s1); + acc3.s1 = fma(a0.s4, b4.s1, acc3.s1); + acc3.s1 = fma(a0.s5, b5.s1, acc3.s1); + acc3.s1 = fma(a0.s6, b6.s1, acc3.s1); + acc3.s1 = fma(a0.s7, b7.s1, acc3.s1); +#endif // M0 > 3 + + src_addr.s0 += sizeof(float) * 8; + } + // float size increment + for(; i < (int)K; ++i) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); +#if M0 > 1 + float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#endif // M0 > 1 +#if M0 > 2 + float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); +#endif // M0 > 2 +#if M0 > 3 + float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); +#endif // M0 > 3 +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if M0 > 1 + float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // M0 > 1 +#if M0 > 2 + float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // M0 > 2 +#if M0 > 3 + float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // M0 > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Multiply and accumulate + 
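// Each left-over iteration performs a rank-1 update: the scalar read from each row of A + // multiplies the two B columns held in b0. +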
acc0.s0 = fma(a0, b0.s0, acc0.s0); + acc0.s1 = fma(a0, b0.s1, acc0.s1); +#if M0 > 1 + acc1.s0 = fma(a1, b0.s0, acc1.s0); + acc1.s1 = fma(a1, b0.s1, acc1.s1); +#endif // M0 > 1 +#if M0 > 2 + acc2.s0 = fma(a2, b0.s0, acc2.s0); + acc2.s1 = fma(a2, b0.s1, acc2.s1); +#endif // M0 > 2 +#if M0 > 3 + acc3.s0 = fma(a3, b0.s0, acc3.s0); + acc3.s1 = fma(a3, b0.s1, acc3.s1); +#endif // M0 > 3 + + src_addr.s0 += sizeof(float); + } + + int z = get_global_id(2); + + // Compute dst address + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, + PARTIAL_STORE_M0) + * dst_stride_y); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension + // in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing row by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (dst_cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + + // Multiply by the weight of matrix-matrix product and store the result +#if defined(ALPHA) + SCALE_BLOCK(M0, float, acc, ALPHA); +#endif // defined(ALPHA) + + // Add beta*bias +#if defined(BETA) + REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0); + +#if defined(BROADCAST_BIAS) + __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)); + + LOAD_BLOCK(1, 2, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(1, float, bias, BETA); +#endif // UNIT_BETA + + // acc = acc + bias[broadcasted] + ADD_BLOCK_BROADCAST(M0, acc, bias0); + +#else // defined(BROADCAST_BIAS) + __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, + PARTIAL_STORE_M0) + * src2_stride_y) + + z * src2_stride_z; + + LOAD_BLOCK(M0, 2, float, bias, src2_addr, 0, src2_stride_y, zero); + +#ifndef UNIT_BETA + SCALE_BLOCK(M0, float, bias, BETA); +#endif // UNIT_BETA + + // acc = acc + bias + ADD_BLOCK(M0, acc, bias); + +#endif // defined(BROADCAST_BIAS) +#endif // defined(BETA) + +#if defined(ACTIVATION_TYPE) + ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, float, VEC_SIZE, acc, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + // Store the output block + const bool cond_y = get_global_id(1) == 0; + const bool cond_x = ((get_global_id(0) + 1) * 2 >= N); + STORE_BLOCK_BOUNDARY_AWARE(M0, 2, float, acc, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); +} + +#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) +/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulates the result in a 32-bit floating point variable.
+ * @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0.
+ * @note This kernel processes a fixed number of elements along x: -DN0=8.
+ * @note The number of columns of matrix A and the number of columns of matrix B need to be passed at compile time using -DK and -DN
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note The optional alpha's value needs to be passed at compile time using -DALPHA
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
+ * @param[in]  src2_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  src2_step_x                        (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  src2_step_y                        (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src2_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  src_cross_plane_pad                (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),
+                                                       IMAGE_DECLARATION(src1),
+#if defined(BETA)
+                                                       IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+                                                       IMAGE_DECLARATION(dst),
+                                                       uint src0_stride_z,
+                                                       uint src1_stride_z,
+#if defined(BETA)
+                                                       uint src2_stride_z,
+#endif // defined(BETA)
+                                                       uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                                       ,
+                                                       uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                       ,
+                                                       uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                      )
+{
+    int idx = get_global_id(0) * N0;
+
+    // Compute starting address for matrix A and matrix B
+    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+    // Update address for the matrix A
+    src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y;
+
+    // Update address for the matrix B
+    src_addr.s1 += idx * sizeof(half);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zin) is calculated dividing row by HEIGHT_GEMM3D
+    uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
+    zin = min(DEPTH_GEMM3D - 1, zin);
+
+    // Add offset due to the cross plane paddings
+    zin *= (src_cross_plane_pad * src0_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply src0_stride_z by DEPTH_GEMM3D
+    src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else  // defined(REINTERPRET_INPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    float8 acc0 = 0.0f;
+#if M0 > 1
+    float8 acc1 = 0.0f;
+#endif // M0 > 1
+#if M0 > 2
+    float8 acc2 = 0.0f;
+#endif // M0 > 2
+#if M0 > 3
+    float8 acc3 = 0.0f;
+#endif // M0 > 3
+
+    int i = 0;
+    for(; i <= ((int)K - 4); i += 4)
+    {
+#if defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A
+        LOAD_BLOCK(M0, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);
+#else  // defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A
+        half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if M0 > 1
+        half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // M0 > 1
+#if M0 > 2
+        half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // M0 > 2
+#if M0 > 3
+        half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // M0 > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+        // Load values from matrix B
+        float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+        src_addr.s1 += src1_stride_y;
+
+        // Accumulate
+        acc0 = fma(b0, (float8)a0.s0, acc0);
+#if M0 > 1
+        acc1 = fma(b0, (float8)a1.s0, acc1);
+#endif // M0 > 1
+#if M0 > 2
+        acc2 = fma(b0, (float8)a2.s0, acc2);
+#endif // M0 > 2
+#if M0 > 3
+        acc3 = fma(b0, (float8)a3.s0, acc3);
+#endif // M0 > 3
+
+        b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+        src_addr.s1 += src1_stride_y;
+        acc0 = fma(b0, (float8)a0.s1, acc0);
+#if M0 > 1
+        acc1 = fma(b0, (float8)a1.s1, acc1);
+#endif // M0 > 1
+#if M0 > 2
+        acc2 = fma(b0, (float8)a2.s1, acc2);
+#endif // M0 > 2
+#if M0 > 3
+        acc3 = fma(b0, (float8)a3.s1, acc3);
+#endif // M0 > 3
+
+        b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+        src_addr.s1 += src1_stride_y;
+        acc0 = fma(b0, (float8)a0.s2, acc0);
+#if M0 > 1
+        acc1 = fma(b0, (float8)a1.s2, acc1);
+#endif // M0 > 1
+#if M0 > 2
+        acc2 = fma(b0, (float8)a2.s2, acc2);
+#endif // M0 > 2
+#if M0 > 3
+        acc3 = fma(b0, (float8)a3.s2, acc3);
+#endif // M0 > 3
+
+        b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+        src_addr.s1 += src1_stride_y;
+        acc0 = fma(b0, (float8)a0.s3, acc0);
+#if M0 > 1
+        acc1 = fma(b0, (float8)a1.s3, acc1);
+#endif // M0 > 1
+#if M0 > 2
+        acc2 = fma(b0, (float8)a2.s3, acc2);
+#endif // M0 > 2
+#if M0 > 3
+        acc3 = fma(b0, (float8)a3.s3, acc3);
+#endif // M0 > 3
+
+        src_addr.s0 += 4 * sizeof(half);
+    }
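+
+    // Loop structure note (added for clarity): the unrolled loop above consumes
+    // K in steps of 4 columns, and the loop below handles the K % 4 leftover.
+    // For example, K = 10 runs the unrolled body for i = 0 and i = 4, then the
+    // scalar tail for i = 8 and i = 9.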
+    for(; i < (int)K; ++i)
+    {
+#if defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A
+        half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
+#if M0 > 1
+        half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#endif // M0 > 1
+#if M0 > 2
+        half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#endif // M0 > 2
+#if M0 > 3
+        half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#endif // M0 > 3
+#else  // defined(REINTERPRET_INPUT_AS_3D)
+        // Load values from matrix A
+        half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if M0 > 1
+        half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // M0 > 1
+#if M0 > 2
+        half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // M0 > 2
+#if M0 > 3
+        half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // M0 > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+        // Load values from matrix B
+        float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+
+        src_addr += (int2)(sizeof(half), src1_stride_y);
+
+        // Accumulate
+        acc0 = fma(b0, (float8)a0, acc0); // b0 * (float8)a0;
+#if M0 > 1
+        acc1 = fma(b0, (float8)a1, acc1); // b0 * (float8)a1;
+#endif // M0 > 1
+#if M0 > 2
+        acc2 = fma(b0, (float8)a2, acc2); // b0 * (float8)a2;
+#endif // M0 > 2
+#if M0 > 3
+        acc3 = fma(b0, (float8)a3, acc3); // b0 * (float8)a3;
+#endif // M0 > 3
+    }
+
+    int z = get_global_id(2);
+
+    // Compute dst address
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * dst_stride_y);
+
+    uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zout) is calculated dividing row by HEIGHT_GEMM3D
+    zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
+    zout = min(DEPTH_GEMM3D - 1, zout);
+
+    // Add offset due to the cross plane paddings
+    zout *= (dst_cross_plane_pad * dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else  // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, float, acc, ALPHA);
+#endif // defined(ALPHA)
+
+#if defined(BETA)
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
+
+    LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+    float8 bias_f0 = convert_float8(bias0);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, float, bias_f, BETA);
+#endif // UNIT_BETA
+
+    // acc = acc + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(M0, acc, bias_f0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src2_stride_y) + z * src2_stride_z;
+
+    LOAD_BLOCK(M0, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+    float8 bias_f0 = convert_float8(bias0);
+#if M0 > 1
+    float8 bias_f1 = convert_float8(bias1);
+#endif // M0 > 1
+#if M0 > 2
+    float8 bias_f2 = convert_float8(bias2);
+#endif // M0 > 2
+#if M0 > 3
+    float8 bias_f3 = convert_float8(bias3);
+#endif // M0 > 3
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, float, bias_f, BETA);
+#endif // UNIT_BETA
+
+    // acc = acc + bias
+    ADD_BLOCK(M0, acc, bias_f);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+    half8 acc_h0 = convert_half8(acc0);
+#if M0 > 1
+    half8 acc_h1 = convert_half8(acc1);
+#endif // M0 > 1
+#if M0 > 2
+    half8 acc_h2 = convert_half8(acc2);
+#endif // M0 > 2
+#if M0 > 3
+    half8 acc_h3 = convert_half8(acc3);
+#endif // M0 > 3
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, half, VEC_SIZE, acc_h, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store the output block
+    const bool cond_y = get_global_id(1) == 0;
+    const bool cond_x = ((get_global_id(0) + 1) * 8 >= N);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, 8, half, acc_h, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+}
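+
+// Precision note (added for clarity): the _acc32 variant above pays a
+// convert_float8()/convert_half8() round trip per tile so that the accumulation
+// along K happens in f32, avoiding the rounding drift of summing many f16
+// products when K is large; the plain f16 variant below keeps the accumulators
+// in half for maximum throughput.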
+
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units.
+ * @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0.
+ * @note This kernel processes a fixed number of elements along x: -DN0=8.
+ * @note The number of columns of matrix A and the number of columns of matrix B need to be passed at compile time using -DK and -DN
+ * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
+ * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
+ * @note The optional alpha's value needs to be passed at compile time using -DALPHA
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
+ *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
+ *       The activation function is performed after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
+ * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
+ * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr
+ * @param[in]  src2_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
+ * @param[in]  src2_step_x                        (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src2_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
+ * @param[in]  src2_step_y                        (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
+ * @param[out] dst_ptr                            Pointer to the destination matrix Supported data types: same as @p src0_ptr
+ * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
+ * @param[in]  src0_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src1_stride_z                      Stride of the source matrix in Z dimension (in bytes)
+ * @param[in]  src2_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  src_cross_plane_pad                (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),
+                                                 IMAGE_DECLARATION(src1),
+#if defined(BETA)
+                                                 IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+                                                 IMAGE_DECLARATION(dst),
+                                                 uint src0_stride_z,
+                                                 uint src1_stride_z,
+#if defined(BETA)
+                                                 uint src2_stride_z,
+#endif // defined(BETA)
+                                                 uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+                                                 ,
+                                                 uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                 ,
+                                                 uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                                )
+{
+    int idx = get_global_id(0) * N0;
+
+    // Compute starting address for matrix A and matrix B
+    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+    // Update address for the matrix A
+    src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y;
+
+    // Update address for the matrix B
+    src_addr.s1 += idx * sizeof(half);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+    // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension
+    // in order to take into account the presence of possible cross plane paddings
+    //
+    //  |                  |
+    //  |      plane0      |
+    //  |                  |
+    //  |__________________|
+    //  |******************|
+    //  |  cross_plane_pad |
+    //  |******************|
+    //  |                  |
+    //  |      plane1      |
+    //  |                  |
+    //  |__________________|
+
+    // The plane (zin) is calculated dividing row by HEIGHT_GEMM3D
+    uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D;
+    zin = min(DEPTH_GEMM3D - 1, zin);
+
+    // Add offset due to the cross plane paddings
+    zin *= (src_cross_plane_pad * src0_stride_y);
+
+    // Add offset for batched GEMM.
The batches will be in the fourth dimension and for this reason we + // multiply src0_stride_z by DEPTH_GEMM3D + src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + src_addr.s0 += get_global_id(2) * src0_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; +#else // defined(MATRIX_B_DEPTH) + src_addr.s1 += get_global_id(2) * src1_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + half8 acc0 = 0.0h; +#if M0 > 1 + half8 acc1 = 0.0h; +#endif // M0 > 1 +#if M0 > 2 + half8 acc2 = 0.0h; +#endif // M0 > 2 +#if M0 > 3 + half8 acc3 = 0.0h; +#endif // M0 > 3 + + int i = 0; + for(; i <= ((int)K - 4); i += 4) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + LOAD_BLOCK(M0, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s); +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if M0 > 1 + half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // M0 > 1 +#if M0 > 2 + half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // M0 > 2 +#if M0 > 3 + half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // M0 > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + + // Accumulate + acc0 = fma(b0, (half8)a0.s0, acc0); +#if M0 > 1 + acc1 = fma(b0, (half8)a1.s0, acc1); +#endif // M0 > 1 +#if M0 > 2 + acc2 = fma(b0, (half8)a2.s0, acc2); +#endif // M0 > 2 +#if M0 > 3 + acc3 = fma(b0, (half8)a3.s0, acc3); +#endif // M0 > 3 + + b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + acc0 = fma(b0, (half8)a0.s1, acc0); +#if M0 > 1 + acc1 = fma(b0, (half8)a1.s1, acc1); +#endif // M0 > 1 +#if M0 > 2 + acc2 = fma(b0, (half8)a2.s1, acc2); +#endif // M0 > 2 +#if M0 > 3 + acc3 = fma(b0, (half8)a3.s1, acc3); +#endif // M0 > 3 + + b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + acc0 = fma(b0, (half8)a0.s2, acc0); +#if M0 > 1 + acc1 = fma(b0, (half8)a1.s2, acc1); +#endif // M0 > 1 +#if M0 > 2 + acc2 = fma(b0, (half8)a2.s2, acc2); +#endif // M0 > 2 +#if M0 > 3 + acc3 = fma(b0, (half8)a3.s2, acc3); +#endif // M0 > 3 + + b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); + src_addr.s1 += src1_stride_y; + acc0 = fma(b0, (half8)a0.s3, acc0); +#if M0 > 1 + acc1 = fma(b0, (half8)a1.s3, acc1); +#endif // M0 > 1 +#if M0 > 2 + acc2 = fma(b0, (half8)a2.s3, acc2); +#endif // M0 > 2 +#if M0 > 3 + acc3 = fma(b0, (half8)a3.s3, acc3); +#endif // M0 > 3 + + src_addr.s0 += 4 * sizeof(half); + } + + for(; i < (int)K; ++i) + { +#if defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); +#if M0 > 1 + half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); +#endif // M0 > 1 +#if M0 > 2 + half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); +#endif // M0 > 2 +#if M0 > 3 + half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); 
+#endif // M0 > 3 +#else // defined(REINTERPRET_INPUT_AS_3D) + // Load values from matrix A + half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); +#if M0 > 1 + half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); +#endif // M0 > 1 +#if M0 > 2 + half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); +#endif // M0 > 2 +#if M0 > 3 + half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); +#endif // M0 > 3 +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Load values from matrix B + half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); + + src_addr += (int2)(sizeof(half), src1_stride_y); + + // Accumulate + acc0 = fma(b0, (half8)a0, acc0); // b0 * (half8)a0; +#if M0 > 1 + acc1 = fma(b0, (half8)a1, acc1); // b0 * (half8)a1; +#endif // M0 > 1 +#if M0 > 2 + acc2 = fma(b0, (half8)a2, acc2); // b0 * (half8)a2; +#endif // M0 > 2 +#if M0 > 3 + acc3 = fma(b0, (half8)a3, acc3); // b0 * (half8)a3; +#endif // M0 > 3 + } + + int z = get_global_id(2); + + // Compute dst address + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * dst_stride_y); + + uint4 zout = 0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + + // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension + // in order to take into account the presence of possible cross plane paddings + // + // | | + // | plane0 | + // | | + // |__________________| + // |******************| + // | cross_plane_pad | + // |******************| + // | | + // | plane1 | + // | | + // |__________________| + + // The plane (zout) is calculated dividing row by HEIGHT_GEMM3D + zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D; + zout = min(DEPTH_GEMM3D - 1, zout); + + // Add offset due to the cross plane paddings + zout *= (dst_cross_plane_pad * dst_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else  // defined(REINTERPRET_OUTPUT_AS_3D)
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+    SCALE_BLOCK(M0, half, acc, ALPHA);
+#endif // defined(ALPHA)
+
+    // Add beta*bias
+#if defined(BETA)
+    REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
+
+    LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(1, half, bias, BETA);
+#endif // UNIT_BETA
+
+    // acc = acc + bias[broadcasted]
+    ADD_BLOCK_BROADCAST(M0, acc, bias0);
+
+#else // defined(BROADCAST_BIAS)
+    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src2_stride_y) + z * src2_stride_z;
+
+    LOAD_BLOCK(M0, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+    SCALE_BLOCK(M0, half, bias, BETA);
+#endif // UNIT_BETA
+
+    // acc = acc + bias
+    ADD_BLOCK(M0, acc, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, half, VEC_SIZE, acc, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+    // Store the output block
+    const bool cond_y = get_global_id(1) == 0;
+    const bool cond_x = ((get_global_id(0) + 1) * 8 >= N);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, 8, half, acc, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+}
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+
+#endif // defined(N) && defined(K) && defined(M0) && defined(N0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/common/gemmlowp.cl b/src/core/CL/cl_kernels/common/gemmlowp.cl
new file mode 100644
index 0000000000..5cafb5389c
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/gemmlowp.cl
@@ -0,0 +1,2160 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#include "gemm_helpers.h" +#include "helpers_asymm.h" +#include "repeat.h" +#include "tile_helpers.h" + +#if defined(DATA_TYPE) && defined(ACC_DATA_TYPE) + +#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) +#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) +#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), (val)); +#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) +#define ARM_DOT(x, y, val) val += arm_dot((x), (y)); +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) + +#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) + +#define ARM_DOT1(a, b, c) \ + ({ \ + ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 3))0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 3))0), c); \ + }) +#define ARM_DOT2(a, b, c) \ + ({ \ + ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 2))0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 2))0), c); \ + }) +#define ARM_DOT3(a, b, c) \ + ({ \ + ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (DATA_TYPE)0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (DATA_TYPE)0), c); \ + }) +#define ARM_DOT4(a, b, c) \ + ({ \ + ARM_DOT(a, b, c); \ + }) +#define ARM_DOT8(a, b, c) \ + ({ \ + ARM_DOT4((a.lo), (b.lo), c); \ + ARM_DOT4((a.hi), (b.hi), c); \ + }) +#define ARM_DOT16(a, b, c) \ + ({ \ + ARM_DOT8((a.lo), (b.lo), c); \ + ARM_DOT8((a.hi), (b.hi), c); \ + }) + +#else // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) + +/** Specialized macros to perform the dot product instruction between two vectors of size K0 [1,16] without using the dot8 instruction. 
*/
+#define ARM_DOT1(a, b, c)             \
+    ({                                \
+        c += (ACC_DATA_TYPE)a * b;    \
+    })
+#define ARM_DOT2(a, b, c)                 \
+    ({                                    \
+        c += (ACC_DATA_TYPE)a.s0 * b.s0;  \
+        c += (ACC_DATA_TYPE)a.s1 * b.s1;  \
+    })
+#define ARM_DOT3(a, b, c)                 \
+    ({                                    \
+        ARM_DOT2(a, b, c);                \
+        c += (ACC_DATA_TYPE)a.s2 * b.s2;  \
+    })
+#define ARM_DOT4(a, b, c)                 \
+    ({                                    \
+        ARM_DOT3(a, b, c);                \
+        c += (ACC_DATA_TYPE)a.s3 * b.s3;  \
+    })
+#define ARM_DOT8(a, b, c)            \
+    ({                               \
+        ARM_DOT4((a.lo), (b.lo), c); \
+        ARM_DOT4((a.hi), (b.hi), c); \
+    })
+#define ARM_DOT16(a, b, c)           \
+    ({                               \
+        ARM_DOT8((a.lo), (b.lo), c); \
+        ARM_DOT8((a.hi), (b.hi), c); \
+    })
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+/** Specialized macros to perform a broadcast dot product operation between one vector "a" and N0 vectors "b" of size K0 [1,16] */
+#define ARM_DOT_K0X1(k0, a, b, c)         \
+    ({                                    \
+        ARM_DOT_K0(k0, (a), (b##0), (c)); \
+    })
+#define ARM_DOT_K0X2(k0, a, b, c)            \
+    ({                                       \
+        ARM_DOT_K0(k0, (a), (b##0), (c.s0)); \
+        ARM_DOT_K0(k0, (a), (b##1), (c.s1)); \
+    })
+#define ARM_DOT_K0X3(k0, a, b, c)            \
+    ({                                       \
+        ARM_DOT_K0X2(k0, a, b, c);           \
+        ARM_DOT_K0(k0, (a), (b##2), (c.s2)); \
+    })
+#define ARM_DOT_K0X4(k0, a, b, c)            \
+    ({                                       \
+        ARM_DOT_K0X3(k0, a, b, c);           \
+        ARM_DOT_K0(k0, (a), (b##3), (c.s3)); \
+    })
+#define ARM_DOT_K0X8(k0, a, b, c)            \
+    ({                                       \
+        ARM_DOT_K0X4(k0, a, b, c);           \
+        ARM_DOT_K0(k0, (a), (b##4), (c.s4)); \
+        ARM_DOT_K0(k0, (a), (b##5), (c.s5)); \
+        ARM_DOT_K0(k0, (a), (b##6), (c.s6)); \
+        ARM_DOT_K0(k0, (a), (b##7), (c.s7)); \
+    })
+#define ARM_DOT_K0X16(k0, a, b, c)           \
+    ({                                       \
+        ARM_DOT_K0X8(k0, a, b, c);           \
+        ARM_DOT_K0(k0, (a), (b##8), (c.s8)); \
+        ARM_DOT_K0(k0, (a), (b##9), (c.s9)); \
+        ARM_DOT_K0(k0, (a), (b##A), (c.sA)); \
+        ARM_DOT_K0(k0, (a), (b##B), (c.sB)); \
+        ARM_DOT_K0(k0, (a), (b##C), (c.sC)); \
+        ARM_DOT_K0(k0, (a), (b##D), (c.sD)); \
+        ARM_DOT_K0(k0, (a), (b##E), (c.sE)); \
+        ARM_DOT_K0(k0, (a), (b##F), (c.sF)); \
+    })
+
+/** Specialized macros to perform a partial matrix multiplication with dimensions M0,N0,K0 */
+#define ARM_MM_K0XN0X1(n0, k0, a, b, c)           \
+    ({                                            \
+        ARM_DOT_K0XN0(n0, k0, (a##0), b, (c##0)); \
+    })
+#define ARM_MM_K0XN0X2(n0, k0, a, b, c)           \
+    ({                                            \
+        ARM_MM_K0XN0X1(n0, k0, a, b, c);          \
+        ARM_DOT_K0XN0(n0, k0, (a##1), b, (c##1)); \
+    })
+#define ARM_MM_K0XN0X3(n0, k0, a, b, c)           \
+    ({                                            \
+        ARM_MM_K0XN0X2(n0, k0, a, b, c);          \
+        ARM_DOT_K0XN0(n0, k0, (a##2), b, (c##2)); \
+    })
+#define ARM_MM_K0XN0X4(n0, k0, a, b, c)           \
+    ({                                            \
+        ARM_MM_K0XN0X3(n0, k0, a, b, c);          \
+        ARM_DOT_K0XN0(n0, k0, (a##3), b, (c##3)); \
+    })
+#define ARM_MM_K0XN0X5(n0, k0, a, b, c)           \
+    ({                                            \
+        ARM_MM_K0XN0X4(n0, k0, a, b, c);          \
+        ARM_DOT_K0XN0(n0, k0, (a##4), b, (c##4)); \
+    })
+#define ARM_MM_K0XN0X6(n0, k0, a, b, c)           \
+    ({                                            \
+        ARM_MM_K0XN0X5(n0, k0, a, b, c);          \
+        ARM_DOT_K0XN0(n0, k0, (a##5), b, (c##5)); \
+    })
+#define ARM_MM_K0XN0X7(n0, k0, a, b, c)           \
+    ({                                            \
+        ARM_MM_K0XN0X6(n0, k0, a, b, c);          \
+        ARM_DOT_K0XN0(n0, k0, (a##6), b, (c##6)); \
+    })
+#define ARM_MM_K0XN0X8(n0, k0, a, b, c)           \
+    ({                                            \
+        ARM_MM_K0XN0X7(n0, k0, a, b, c);          \
+        ARM_DOT_K0XN0(n0, k0, (a##7), b, (c##7)); \
+    })
+
+#define ARM_DOT_K0(k0, a, b, c) \
+    ({                          \
+        CONCAT(ARM_DOT, k0)     \
+        ((a), (b), (c));        \
+    })
+
+#define ARM_DOT_K0XN0(n0, k0, a, b, c) \
+    ({                                 \
+        CONCAT(ARM_DOT_K0X, n0)        \
+        (k0, (a), b, (c));             \
+    })
+
+#define ARM_MM_K0XN0XM0(m0, n0, k0, a, b, c) \
+    ({                                       \
+        CONCAT(ARM_MM_K0XN0X, m0)            \
+        (n0, k0, a, b, c);                   \
+    })
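+
+// Expansion example for the macros above (added for clarity): with M0 = 2,
+// N0 = 4 and K0 = 16, ARM_MM_K0XN0XM0(2, 4, 16, a, b, c) resolves through
+// CONCAT to ARM_MM_K0XN0X2, which issues one ARM_DOT_K0XN0 per LHS row:
+// c0.s0..c0.s3 accumulate dot(a0, b0)..dot(a0, b3) and c1.s0..c1.s3 accumulate
+// dot(a1, b0)..dot(a1, b3), each 16-wide dot lowered to ARM_DOT16 (arm_dot
+// instructions when the dot8 extension is available, plain multiply-adds
+// otherwise).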
+
+/** Specialized macros to perform a broadcast multiply-accumulate between one vector "a" of size K0 [1,16] and K0 vectors "b" of size N0 */
+#define ARM_MUL_N0X1(VECTOR_ACC_TYPE, a, b, c)     \
+    ({                                             \
+        c += CONVERT(b##0, VECTOR_ACC_TYPE) * a;   \
+    })
+#define ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c)         \
+    ({                                                 \
+        c += CONVERT(b##0, VECTOR_ACC_TYPE) * a.s##0;  \
+        c += CONVERT(b##1, VECTOR_ACC_TYPE) * a.s##1;  \
+    })
+#define ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c)         \
+    ({                                                 \
+        ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c);        \
+        c += CONVERT(b##2, VECTOR_ACC_TYPE) * a.s##2;  \
+    })
+#define ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c)         \
+    ({                                                 \
+        ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c);        \
+        c += CONVERT(b##3, VECTOR_ACC_TYPE) * a.s##3;  \
+    })
+#define ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c)         \
+    ({                                                 \
+        ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c);        \
+        c += CONVERT(b##4, VECTOR_ACC_TYPE) * a.s##4;  \
+        c += CONVERT(b##5, VECTOR_ACC_TYPE) * a.s##5;  \
+        c += CONVERT(b##6, VECTOR_ACC_TYPE) * a.s##6;  \
+        c += CONVERT(b##7, VECTOR_ACC_TYPE) * a.s##7;  \
+    })
+#define ARM_MUL_N0X16(VECTOR_ACC_TYPE, a, b, c)        \
+    ({                                                 \
+        ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c);        \
+        c += CONVERT(b##8, VECTOR_ACC_TYPE) * a.s##8;  \
+        c += CONVERT(b##9, VECTOR_ACC_TYPE) * a.s##9;  \
+        c += CONVERT(b##A, VECTOR_ACC_TYPE) * a.s##A;  \
+        c += CONVERT(b##B, VECTOR_ACC_TYPE) * a.s##B;  \
+        c += CONVERT(b##C, VECTOR_ACC_TYPE) * a.s##C;  \
+        c += CONVERT(b##D, VECTOR_ACC_TYPE) * a.s##D;  \
+        c += CONVERT(b##E, VECTOR_ACC_TYPE) * a.s##E;  \
+        c += CONVERT(b##F, VECTOR_ACC_TYPE) * a.s##F;  \
+    })
+/** Specialized macros to perform a partial matrix multiplication with dimensions M0,N0,K0 */
+#define ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c)     \
+    ({                                                          \
+        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##0), b, (c##0));  \
+    })
+#define ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c)     \
+    ({                                                          \
+        ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c);    \
+        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##1), b, (c##1));  \
+    })
+#define ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c)     \
+    ({                                                          \
+        ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c);    \
+        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##2), b, (c##2));  \
+    })
+#define ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c)     \
+    ({                                                          \
+        ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c);    \
+        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##3), b, (c##3));  \
+    })
+#define ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c)     \
+    ({                                                          \
+        ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c);    \
+        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##4), b, (c##4));  \
+    })
+#define ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c)     \
+    ({                                                          \
+        ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c);    \
+        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##5), b, (c##5));  \
+    })
+#define ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c)     \
+    ({                                                          \
+        ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c);    \
+        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##6), b, (c##6));  \
+    })
+#define ARM_MM_NATIVE_N0XK0X8(VECTOR_ACC_TYPE, k0, a, b, c)     \
+    ({                                                          \
+        ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c);    \
+        ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##7), b, (c##7));  \
+    })
+#define ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, a, b, c) \
+    ({                                              \
+        CONCAT(ARM_MUL_N0X, k0)                     \
+        (VECTOR_ACC_TYPE, (a), b, (c));             \
+    })
+#define ARM_MM_NATIVE_N0XK0XM0(VECTOR_ACC_TYPE, m0, k0, a, b, c) \
+    ({                                                           \
+        CONCAT(ARM_MM_NATIVE_N0XK0X, m0)                         \
+        (VECTOR_ACC_TYPE, k0, a, b, c);                          \
+    })
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && defined(N) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices with QASYMM8/QASYMM8_SIGNED data type.
+ * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
+ * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
+ * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (i.e. -DM=52 and -DN=90).
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (i.e. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *       - M0 = 2, 3, 4, 5, 6, 7, 8
+ *       - N0 = 2, 3, 4, 8, 16
+ *       - K0 = 2, 3, 4, 8, 16
+ *       - V0 >= 1
+ *       - H0 >= 1
+ *
+ * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in]  lhs_ptr                           Pointer to the LHS reshaped matrix. Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  lhs_stride_x                      Stride of the LHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                      Stride of the LHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
+ * @param[in]  rhs_ptr                           Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
+ * @param[in]  rhs_stride_x                      Stride of the RHS reshaped matrix in X dimension (in bytes)
+ * @param[in]  rhs_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rhs_stride_y                      Stride of the RHS reshaped matrix in Y dimension (in bytes)
+ * @param[in]  rhs_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix Supported data type: S32
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ * @param[in]  k                                 Number of columns in LHS matrix and rows in RHS matrix not reshaped.
+ * @param[in]  lhs_stride_z                      Stride of the LHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  rhs_stride_z                      Stride of the RHS reshaped matrix in Z dimension (in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_cross_plane_pad               (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),
+                                                IMAGE_DECLARATION(rhs),
+                                                IMAGE_DECLARATION(dst),
+                                                uint k,
+                                                uint lhs_stride_z,
+                                                uint rhs_stride_z,
+                                                uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+                                                ,
+                                                uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+                                               )
+{
+    // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (K0)
+#define LHS_STEP_X ((K0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else  // defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (K0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(LHS_INTERLEAVE)
+
+    // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+    // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else  // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+    uint x = get_global_id(0);
+    uint y = get_global_id(1);
+    uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+    if((x * N0 >= N) || (y * M0 >= M))
+    {
+        return;
+    }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+    // Compute LHS matrix address
+    __global DATA_TYPE *lhs_addr = (__global DATA_TYPE *)(lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z));
+
+    // Compute RHS matrix address
+    __global DATA_TYPE *rhs_addr = (__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y);
+
+#if defined(MATRIX_B_DEPTH)
+    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+    rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else  // defined(MATRIX_B_DEPTH)
+    rhs_addr += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+    REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
+
+    // Initialize the accumulators
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+    for(int i = 0; i < k; i += K0)
+    {
+        // Load values from LHS matrix
+        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X, zlhs);
+
+        // Load values from RHS matrix
+        LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X, zrhs);
+
+        // Partial matrix multiplication M0,N0,K0
+        ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);
+
+        // Update address
+        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP);
+        rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP);
+    }
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (y * (uint)M0 * dst_stride_y);
+
+    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+    CALCULATE_Z_OFFSET(M0, uint, zout, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
+
+    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+    // multiply dst_stride_z by DEPTH_GEMM3D
+    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else  // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Add offset for batched GEMM
+    dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+    // Convert and store output block
+    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
+    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
+
+    // Store output block
+    REPEAT_VAR_INIT_CONVERT_SAT(M0, VEC_DATA_TYPE(int, N0), c, c_lp);
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && defined(N) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
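+
+// Addressing example for the reshaped LHS above (added for clarity): with
+// LHS_INTERLEAVE, V0 = 2 and K0 = 4, two M0xK0 blocks share one row of the
+// reshaped LHS interleaved at K0 granularity, so a work-item with y = 5 starts
+// at element (5 % 2) * 4 = 4 of row 5 / 2 = 2, and consecutive rows of its own
+// block sit LHS_STEP_X = K0 * V0 = 8 elements apart.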
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
+
+#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)
+#define FUSED_OUTPUT_STAGE_FIXED_POINT
+#endif // defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices with fused output stage using fixed-point arithmetic.
+ * The LHS matrix is NOT reshaped
+ * The RHS matrix is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar)
+ * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint)
+ * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ *       - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ *       - N0 = 2, 3, 4, 8, 16
+ *       - K0 = 2, 3, 4, 8, 16
+ *       - H0 >= 1
+ *
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
+ *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
+ *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @note The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULTIPLIER and -DRESULT_SHIFT
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ *       These values can be used to implement "rectified linear unit" activation functions
+ * @note In case of per-channel quantization of matrix B, -DPER_CHANNEL_QUANTIZATION must be passed at compile time.
+ *
+ * @param[in]  lhs_ptr                                              Pointer to the LHS matrix. Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  lhs_stride_x                                         Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in]  lhs_step_x                                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  lhs_stride_y                                         Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in]  lhs_step_y                                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  lhs_offset_first_element_in_bytes                    The offset of the first element in the LHS matrix
+ * @param[in]  rhs_ptr                                              Pointer to the RHS reshaped matrix.
Supported data type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes) + * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes) + * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: S32 + * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes) + * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes) + * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor + * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: S32 + * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes) + * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes) + * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor + * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: S32 + * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor + * @param[in] result_multipliers_ptr (Optional) Pointer to the output multipliers vector for per-channel quantization. 
Supported data types: S32 + * @param[in] result_multipliers_stride_x (Optional) Stride of the output multipliers vector in X dimension (in bytes) + * @param[in] result_multipliers_step_x (Optional) output_multipliers_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first element in the output multipliers vector + * @param[in] result_shifts_ptr (Optional) Pointer to the output shifts vector for per-channel quantization. Supported data types: S32 + * @param[in] result_shifts_stride_x (Optional) Stride of the output shifts vector in X dimension (in bytes) + * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first element in the output shifts vector + */ +#if defined(FUSED_OUTPUT_STAGE_FIXED_POINT) +__kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint +#else // defined(FUSED_OUTPUT_STAGE_FIXED_POINT) +__kernel void gemmlowp_mm_reshaped_only_rhs_t +#endif // defined(FUSED_OUTPUT_STAGE_FIXED_POINT) +(IMAGE_DECLARATION(lhs), + IMAGE_DECLARATION(rhs), + IMAGE_DECLARATION(dst), + uint lhs_stride_z, + uint rhs_stride_z, + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint lhs_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D +#if defined(A_OFFSET) + , + IMAGE_DECLARATION(sum_col) +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + IMAGE_DECLARATION(sum_row) +#endif // defined(B_OFFSET) +#if defined(ADD_BIAS) + , + VECTOR_DECLARATION(biases) +#endif // defined(ADD_BIAS) +#if defined(PER_CHANNEL_QUANTIZATION) + , + VECTOR_DECLARATION(result_multipliers), + VECTOR_DECLARATION(result_shifts) +#endif // defined(PER_CHANNEL_QUANTIZATION) +) +{ + // @note: replace with (DIMENSION + PAD) once we pass the relevant info at compile time +#define FULL_LHS_HEIGHT (lhs_stride_z / lhs_stride_y) +#define FULL_DST_HEIGHT (dst_stride_z / dst_stride_y) + + // RHS offset and step X +#if defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (K0) +#define RHS_STEP_X (K0 * H0) +#else // defined(RHS_INTERLEAVE) +#define RHS_OFFSET_X (K0 * N0) +#define RHS_STEP_X (K0) +#endif // defined(RHS_INTERLEAVE) +#define RHS_STEP_LOOP (N0 * K0 * H0) + + uint x = GET_SPATIAL_IDX(0, 1, 1); + uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0); + uint z = GET_SPATIAL_IDX(2, 1, 1); + int xo = (x * N0); + +#if defined(DUMMY_WORK_ITEMS) + if((xo >= N) || (y >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + uint lhs_y = y + z * FULL_LHS_HEIGHT; + + // Compute RHS matrix address + uint rhs_offset_x = (x % H0) * RHS_OFFSET_X; + uint rhs_offset_y = (x / H0) * rhs_stride_y; + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_offset_y += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_offset_y += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + // Initialize the accumulators + TILE(ACC_DATA_TYPE, M0, N0, c); + LOOP_UNROLLING(int, i, 0, 1, M0, + { + c[i].v = 0; + }) + + int i = 0; + for(; i <= (K - K0); i += K0) + { + TILE(DATA_TYPE, M0, K0, a); + TILE(DATA_TYPE, N0, K0, b); + + // Load values from LHS matrix + T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, i, lhs_y, 1, lhs_stride_y, 
a); + + // Load values from RHS matrix + LOOP_UNROLLING(int, _i, 0, 1, N0, + { + b[_i].v = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + rhs_offset_x + rhs_offset_y + _i * RHS_STEP_X)); + }) + + // Partial matrix multiplication M0,N0,K0 + T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c); + + rhs_offset_x += RHS_STEP_LOOP; + } + +#if((K % K0) != 0) + + // Left-over accumulations + for(; i < K; ++i) + { + TILE(DATA_TYPE, M0, 1, a); + TILE(DATA_TYPE, N0, 1, b); + + // Load values from LHS matrix + T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, i, lhs_y, 1, lhs_stride_y, a); + + LOOP_UNROLLING(int, _i, 0, 1, N0, + { + b[_i].v = *(__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + rhs_offset_x + rhs_offset_y + _i * RHS_STEP_X); + }) + + T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c); + + rhs_offset_x += 1; + } +#endif // ((K % K0) != 0) + +#if defined(FUSED_OUTPUT_STAGE_FIXED_POINT) + + TILE(int, M0, N0, c_int); + TILE(int, M0, N0, offset_s32); + LOOP_UNROLLING(int, i, 0, 1, M0, + { + offset_s32[i].v = (VEC_DATA_TYPE(int, N0))K_OFFSET; + }) + + LOOP_UNROLLING(int, i, 0, 1, M0, + { + c_int[i].v = CONVERT_SAT(c[i].v, VEC_DATA_TYPE(int, N0)); + }) + +#if defined(A_OFFSET) + +#if defined(SUM_COL_HAS_BATCHES) + int sum_col_y = z; +#else // defined(SUM_COL_HAS_BATCHES) + int sum_col_y = 0; +#endif // defined(SUM_COL_HAS_BATCHES) + TILE(int, 1, N0, a_offset_s32); + + T_LOAD(int, 1, N0, BUFFER, sum_col, xo, sum_col_y, 1, sum_col_stride_y, a_offset_s32); + + a_offset_s32[0].v *= A_OFFSET; + + T_ADD_BROADCAST_X(int, M0, 1, offset_s32, a_offset_s32, offset_s32); +#endif // defined(A_OFFSET) + +#if defined(B_OFFSET) + // Compute the offset contribution due to B_OFFSET + // Note: The sum_row tensor is generated through CLGEMMLowpMatrixAReductionKernel which + // does not introduce paddings.
For this reason it is safe to access the tensor in this manner + // without considering that the coordinate "y" could come from an input 3D tensor + TILE(int, M0, N0, b_offset_s32); + + T_LOAD(int, M0, 1, BUFFER, sum_row, y + z * (sum_row_stride_y / sizeof(int)), 0, 1, sum_row_stride_x, b_offset_s32); + + LOOP_UNROLLING(int, i, 0, 1, M0, + { + offset_s32[i].v += b_offset_s32[i].v * B_OFFSET; + }) + +#endif // defined(B_OFFSET) + +#if defined(ADD_BIAS) + + TILE(int, 1, N0, bias); + + T_LOAD(int, 1, N0, BUFFER, biases, xo, 0, 1, 0, bias); + + T_ADD_BROADCAST_X(ACC_DATA_TYPE, M0, 1, offset_s32, bias, offset_s32); +#endif // defined(ADD_BIAS) + + LOOP_UNROLLING(int, i, 0, 1, M0, + { + c_int[i].v += offset_s32[i].v; + }) + + TILE(DATA_TYPE, M0, N0, c_lp); + + // Multiply by result_mult_int and shift +#if defined(PER_CHANNEL_QUANTIZATION) + TILE(int, 1, N0, res_mul); + TILE(int, 1, N0, res_shift); + + T_LOAD(int, 1, N0, BUFFER, result_multipliers, xo, 0, 0, 0, res_mul); + T_LOAD(int, 1, N0, BUFFER, result_shifts, xo, 0, 0, 0, res_shift); + + T_QUANTIZE8(int, DATA_TYPE, PER_CHANNEL, M0, N0, RESULT_OFFSET, RESULT_SHIFT, RESULT_MULTIPLIER, c_int, res_mul, res_shift, c_lp); +#else // defined(PER_CHANNEL_QUANTIZATION) + T_QUANTIZE8(int, DATA_TYPE, PER_TENSOR, M0, N0, RESULT_OFFSET, RESULT_SHIFT, RESULT_MULTIPLIER, c_int, 0, 0, c_lp); +#endif // defined(PER_CHANNEL_QUANTIZATION) + +#if defined(MIN_BOUND) + LOOP_UNROLLING(int, i, 0, 1, M0, + { + c_lp[i].v = max(c_lp[i].v, (VEC_DATA_TYPE(DATA_TYPE, N0))MIN_BOUND); + }) +#endif // defined(MIN_BOUND) +#if defined(MAX_BOUND) + LOOP_UNROLLING(int, i, 0, 1, M0, + { + c_lp[i].v = min(c_lp[i].v, (VEC_DATA_TYPE(DATA_TYPE, N0))MAX_BOUND); + }) +#endif // defined(MAX_BOUND) + +#else // defined(FUSED_OUTPUT_STAGE_FIXED_POINT) + TILE(int, M0, N0, c_lp); + + LOOP_UNROLLING(int, i, 0, 1, M0, + { + c_lp[i].v = CONVERT_SAT(c[i].v, VEC_DATA_TYPE(int, N0)); + }) +#endif // defined(FUSED_OUTPUT_STAGE_FIXED_POINT) + + TILE(uint, M0, 1, dst_indirect_y); + + LOOP_UNROLLING(int, i, 0, 1, M0, + { +#if defined(REINTERPRET_OUTPUT_AS_3D) + dst_indirect_y[i].v = (uint)min((int)((y + i) % HEIGHT_GEMM3D), (int)HEIGHT_GEMM3D - 1); + dst_indirect_y[i].v += (uint)min((int)((y + i) / HEIGHT_GEMM3D), (int)DEPTH_GEMM3D - 1) * FULL_DST_HEIGHT; + dst_indirect_y[i].v += z * FULL_DST_HEIGHT * DEPTH_GEMM3D; +#else // defined(REINTERPRET_OUTPUT_AS_3D) + dst_indirect_y[i].v = (uint)min((int)y + i, (int)M - 1) + z * FULL_DST_HEIGHT; +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + }) + + const bool cond_x = (xo > (N - N0)); + +#if defined(FUSED_OUTPUT_STAGE_FIXED_POINT) + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, xo, dst_stride_y, cond_x, c_lp, dst_indirect_y); +#else // defined(FUSED_OUTPUT_STAGE_FIXED_POINT) + T_STORE_INDIRECT_WIDTH_SELECT(int, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, xo, dst_stride_y, cond_x, c_lp, dst_indirect_y); +#endif // defined(FUSED_OUTPUT_STAGE_FIXED_POINT) + +#undef RHS_OFFSET_X +#undef RHS_STEP_X +#undef RHS_STEP_LOOP +} +#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) + +#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) + +/** This OpenCL kernel computes the matrix multiplication between 2 matrices. + * The LHS matrix is NOT reshaped + * The RHS matrix is NOT reshaped + * + * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e.
-DDATA_TYPE=uchar) + * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint) + * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64) + * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2) + * @note The number of N0 columns to process must be passed at compile time using -DN0 (i.e. -DN0=2) + * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (i.e., -DK0=2) + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 + * + * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time: + * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D + * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D + * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. + * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor + * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix + * + * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: QASYMM8 + * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes) + * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes) + * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix + * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. 
Supported data type: same as @p lhs_ptr + * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes) + * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes) + * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix + * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32 + * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix + * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes) + * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D) + * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) + */ +__kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs), + IMAGE_DECLARATION(rhs), + IMAGE_DECLARATION(dst), + uint lhs_stride_z, + uint rhs_stride_z, + uint dst_stride_z +#if defined(REINTERPRET_INPUT_AS_3D) + , + uint lhs_cross_plane_pad +#endif // REINTERPRET_INPUT_AS_3D +#if defined(REINTERPRET_OUTPUT_AS_3D) + , + uint dst_cross_plane_pad +#endif // REINTERPRET_OUTPUT_AS_3D + ) +{ + uint x = get_global_id(0); + uint y = get_global_id(1); + uint z = get_global_id(2); + +#if defined(DUMMY_WORK_ITEMS) + if((x * N0 >= N) || (y * M0 >= M)) + { + return; + } +#endif // defined(DUMMY_WORK_ITEMS) + + // Compute LHS matrix address + uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; + + // Compute RHS matrix address + uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE); + +#if defined(MATRIX_B_DEPTH) + // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 + rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; +#else // defined(MATRIX_B_DEPTH) + rhs_offset += z * rhs_stride_z; +#endif // defined(MATRIX_B_DEPTH) + + REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); + REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0); + +#if defined(REINTERPRET_INPUT_AS_3D) + // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); + + // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we + // multiply lhs_stride_z by DEPTH_GEMM3D + lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_INPUT_AS_3D) + + // Add offset for batched GEMM + lhs_offset += z * lhs_stride_z; + +#endif // defined(REINTERPRET_INPUT_AS_3D) + + // Initialize the accumulators + REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0; + + int i = 0; + + for(; i <= (K - K0); i += K0) + { + // Load values from LHS matrix + LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); + + // Load values from RHS matrix + LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs); + + // Partial matrix multiplication M0,N0,K0 +#if(GPU_ARCH == GPU_ARCH_MIDGARD) + ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, K0, a, b, c); +#else // GPU_ARCH == GPU_ARCH_MIDGARD + // Transpose the values from RHS matrix + TRANSPOSE_K0XN0(K0, N0, b_t, b, DATA_TYPE); + + ARM_MM_K0XN0XM0(M0, N0, K0, a, b_t, c); +#endif // GPU_ARCH == GPU_ARCH_MIDGARD + + // Update the offset + lhs_offset += K0; + rhs_offset += K0 * rhs_stride_y; + } + + // Left-over for loop + for(; i < K; ++i) + { + // Load values from LHS matrix + LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); + + // Load values from RHS matrix + LOAD_BLOCK(1, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs); + + // Partial matrix multiplication M0,N0,1 +#if(GPU_ARCH == GPU_ARCH_MIDGARD) + ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, 1, a, b, c); +#else // GPU_ARCH == GPU_ARCH_MIDGARD + // Transpose the values from RHS matrix + TRANSPOSE_K0XN0(1, N0, b_t, b, DATA_TYPE); + + ARM_MM_K0XN0XM0(M0, N0, 1, a, b_t, c); +#endif // GPU_ARCH == GPU_ARCH_MIDGARD + + // Update the offset + lhs_offset += 1; + rhs_offset += rhs_stride_y; + } + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); + + REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; + +#if defined(REINTERPRET_OUTPUT_AS_3D) + // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D + CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); + + // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we + // multiply dst_stride_z by DEPTH_GEMM3D + dst_addr += z * dst_stride_z * DEPTH_GEMM3D; + +#else // defined(REINTERPRET_OUTPUT_AS_3D) + + // Add offset for batched GEMM + dst_addr += z * dst_stride_z; + +#endif // defined(REINTERPRET_OUTPUT_AS_3D) + const bool cond_y = y == 0; + const bool cond_x = ((x + 1) * N0 >= N); + + // Convert and store output block + REPEAT_VAR_INIT_CONVERT(M0, VEC_DATA_TYPE(int, N0), c, res); // resN = CONVERT(cN, VEC_DATA_TYPE(int, N0)); + STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, res, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); +} +#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) + +#if defined(COLS_A) +/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A. + * It is also possible to multiply each reduced row by a scalar value, if SCALAR is passed at compile time. 
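 + * @note As a purely illustrative example of the loop structure in this kernel (not an extra requirement): with -DCOLS_A=35, the 16-wide vectorized loop accumulates columns 0-31 in two iterations and the scalar leftover loop adds the remaining 3 columns one by one.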
+ * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + * + * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A + * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar) + * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint) + * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (e.g. -DSCALAR=3) + * + * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED/QSYMM8 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + // Compute source and destination addresses + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) + sum_row_32 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))0; + ACC_DATA_TYPE sum_row = 0; + + __global const DATA_TYPE *matrix_a = (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z); + + int i = 0; + + // This for loop performs 16 accumulations + for(; i <= ((int)COLS_A - 16); i += 16) + { + const VEC_DATA_TYPE(DATA_TYPE, 16) a0 = vload16(0, matrix_a + i); + + sum_row_32 += CONVERT(a0.s0123, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.s4567, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.s89AB, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.sCDEF, + VEC_DATA_TYPE(ACC_DATA_TYPE, 4)); + } + + // This for loop performs the leftover accumulations + for(; i < COLS_A; ++i) + { + sum_row += (ACC_DATA_TYPE)matrix_a[i]; + } + + sum_row += sum_row_32.s0 + sum_row_32.s1 + sum_row_32.s2 + sum_row_32.s3; + +#if defined(SCALAR) + sum_row *= (int)SCALAR; +#endif // defined(SCALAR) + *((__global int *)dst.ptr) = (int)sum_row; +} + +#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) +/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A using the arm dot product instruction. + * It is also possible to multiply each reduced row by a scalar value, if SCALAR is passed at compile time. 
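 + * @note The row sum below is expressed as dot products against a vector of ones (e.g. sum_row += arm_dot(a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1))), so that four 8-bit accumulations are issued as a single dot-product instruction.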
+ * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + * + * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A + * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar) + * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint) + * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (e.g. -DSCALAR=3) + * + * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED/QSYMM8 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + // Compute source and destination addresses + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + ACC_DATA_TYPE sum_row = 0; + + __global const DATA_TYPE *matrix_a = (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z); + + int i = 0; + + // This for loop performs 32 accumulations + for(; i <= ((int)COLS_A - 32); i += 32) + { + VEC_DATA_TYPE(DATA_TYPE, 16) + a0 = vload16(0, matrix_a + i); + + sum_row += arm_dot(a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + sum_row += arm_dot(a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + sum_row += arm_dot(a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + sum_row += arm_dot(a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + + a0 = vload16(1, matrix_a + i); + + sum_row += arm_dot(a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + sum_row += arm_dot(a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + sum_row += arm_dot(a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + sum_row += arm_dot(a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); + } + + // This for loop performs the leftover accumulations + for(; i < COLS_A; ++i) + { + sum_row += (ACC_DATA_TYPE)matrix_a[i]; + } + +#if defined(SCALAR) + sum_row *= (int)SCALAR; +#endif // defined(SCALAR) + *((__global int *)dst.ptr) = (int)sum_row; +} +#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) +#endif // defined(COLS_A) + +#if defined(COLS_B) && defined(ROWS_B) && defined(VEC_SIZE) &&
defined(VEC_SIZE_LEFTOVER) +/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B. + * It is also possible to multiply each reduced column by a scalar value, if SCALAR is passed at compile time. + * + * @note This stage is needed to handle the offset of matrix product + * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md + * + * @attention The number of matrix B columns and rows needs to be passed at compile time using -DCOLS_B and -DROWS_B + * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar) + * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint) + * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (i.e. -DSCALAR=3) + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE + * + * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32 + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + // Compute source and destination addresses + const uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); + const uint y = get_global_id(1); + + __global const DATA_TYPE *matrix_b = (__global const DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + y * src_step_y + y * src_stride_z); + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(int) + y * dst_stride_y; + + VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) + sum_col_32 = (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))0; + + int i = 0; + // This for loop performs 4 accumulations + for(; i <= ((int)ROWS_B - 4); i += 4) + { + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + b0 = VLOAD(VEC_SIZE)(0, matrix_b + 0 * src_stride_y); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + b1 = VLOAD(VEC_SIZE)(0, matrix_b + 1 * src_stride_y); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + b2 = 
VLOAD(VEC_SIZE)(0, matrix_b + 2 * src_stride_y); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + b3 = VLOAD(VEC_SIZE)(0, matrix_b + 3 * src_stride_y); + + sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b1, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b2, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b3, + VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); + + matrix_b += 4 * src_stride_y; + } + + // This for loop performs the leftover accumulations + for(; i < (int)ROWS_B; ++i) + { + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + b0 = VLOAD(VEC_SIZE)(0, matrix_b); + + sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); + + matrix_b += src_stride_y; + } + +#if defined(SCALAR) + sum_col_32 *= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))SCALAR; +#endif // defined(SCALAR) + VEC_DATA_TYPE(int, VEC_SIZE) + res0 = CONVERT(sum_col_32, VEC_DATA_TYPE(int, VEC_SIZE)); + + STORE_VECTOR_SELECT(res, int, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) +} +#endif // defined(COLS_B) && defined(ROWS_B) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) + +#endif // defined(DATA_TYPE) && defined(ACC_DATA_TYPE) + +#if defined(K_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) + +#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) + +/* Helper function used to calculate the offset contribution after matrix multiplication. + * + * This function takes a final int32 accumulator value (the output of matrix multiplication), + * and calculates the offset contribution of matrix A and matrix B. + * + * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200) + * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1) + * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6) + * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE + * + * @param[in] x max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) + * @param[in] y get_global_id(1) + * @param[in] z get_global_id(2) + * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr + * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes) + * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes) + * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor + * @param[in] sum_row_ptr (Optional) Pointer to the source tensor.
Supported data type: same as @p mm_result_ptr + * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes) + * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes) + * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor + * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr + * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor + */ +inline VEC_INT offset_contribution( + int x, + int y, + int z +#if defined(A_OFFSET) + , + IMAGE_DECLARATION(sum_col) +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + IMAGE_DECLARATION(sum_row) +#endif // defined(B_OFFSET) +#if defined(ADD_BIAS) + , + VECTOR_DECLARATION(biases) +#endif // defined(ADD_BIAS) +) +{ + VEC_INT a_offset_s32 = (VEC_INT)0; + VEC_INT b_offset_s32 = (VEC_INT)0; + + int batch_id = z; +#if defined(DEPTH_INPUT3D) + batch_id /= (int)DEPTH_INPUT3D; +#endif // defined(DEPTH_INPUT3D) + +#if defined(A_OFFSET) + // Compute the offset contribution due to A_OFFSET + __global uchar *sum_col_addr = sum_col_ptr + sum_col_offset_first_element_in_bytes + x * sizeof(int); + +#if defined(SUM_COL_HAS_BATCHES) + a_offset_s32 = VLOAD(VEC_SIZE)(0, (__global int *)(sum_col_addr + batch_id * sum_col_stride_y)); +#else // defined(SUM_COL_HAS_BATCHES) + a_offset_s32 = VLOAD(VEC_SIZE)(0, (__global int *)sum_col_addr); +#endif // defined(SUM_COL_HAS_BATCHES) + + a_offset_s32 *= (VEC_INT)A_OFFSET; +#endif // defined(A_OFFSET) + +#if defined(B_OFFSET) + // Compute the offset contribution due to B_OFFSET + __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + y * sizeof(int); + +#if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D) + b_offset_s32 = (VEC_INT) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)) + (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D); +#else // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D) + b_offset_s32 = (VEC_INT) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y))); +#endif // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D) + b_offset_s32 *= (VEC_INT)B_OFFSET; +#endif // defined(B_OFFSET) + +#if defined(ADD_BIAS) + // Add bias + __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); + + VEC_INT biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr); + b_offset_s32 += (VEC_INT)biases_values; +#endif // defined(ADD_BIAS) + + return (VEC_INT)K_OFFSET + a_offset_s32 + b_offset_s32; +} + +/* OpenCL kernel used to add the offset contribution after matrix multiplication. The computation is performed in-place. + * + * This kernel takes a final int32 accumulator value (the output of matrix multiplication), + * and adds to it the offset contribution of matrix A and matrix B in-place.
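 + * For example (purely illustrative values): with A_OFFSET = 1, B_OFFSET = 6 and K_OFFSET = 1200 (i.e. a_offset * b_offset * k = 1 * 6 * 200), sum_col[k] = 50 and sum_row[i] = 30, each accumulator is increased by 50 * 1 + 30 * 6 + 1200 = 1430.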
+ * + * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200) + * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1) + * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6) + * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE + * + * The final result is: + * + * mm_result[i][k] = mm_result[i][k] + + * (sum_col[k] * A_OFFSET) + + * (sum_row[i] * B_OFFSET) + + * (K_OFFSET) + * + * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32 + * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr + * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes) + * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes) + * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor + * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr + * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes) + * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes) + * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor + * @param[in] biases_ptr (Optional) Pointer to the biases tensor. 
Supported data type: same as @p src_ptr + * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor + */ +__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result) +#if defined(A_OFFSET) + , + IMAGE_DECLARATION(sum_col) +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + IMAGE_DECLARATION(sum_row) +#endif // defined(B_OFFSET) +#if defined(ADD_BIAS) + , + VECTOR_DECLARATION(biases) +#endif // defined(ADD_BIAS) + ) +{ + const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); + const int y = get_global_id(1); + const int z = get_global_id(2); + + // Compute offset contribution + VEC_INT offset_term_s32 = offset_contribution( + x, y, z +#if defined(A_OFFSET) + , + sum_col_ptr, + sum_col_stride_x, + sum_col_step_x, + sum_col_stride_y, + sum_col_step_y, + sum_col_offset_first_element_in_bytes +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + sum_row_ptr, + sum_row_stride_x, + sum_row_step_x, + sum_row_stride_y, + sum_row_step_y, + sum_row_offset_first_element_in_bytes +#endif // defined(B_OFFSET) +#if defined(ADD_BIAS) + , + biases_ptr, + biases_stride_x, + biases_step_x, + biases_offset_first_element_in_bytes +#endif // defined(ADD_BIAS) + ); + + __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z; + + VEC_INT in_s32_0 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr); + + // Add the offset terms to GEMM's result + in_s32_0 += offset_term_s32; + + // Store the result with the offset contribution + STORE_VECTOR_SELECT(in_s32_, int, mm_result_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) +} + +#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT) && defined(OUTPUT_DATA_TYPE) +/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and quantize the result down to uint8. + * + * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output stage. + * + * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200) + * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1) + * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6) + * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches + * + * The result before the output stage is: + * + * mm_result[i][k] = mm_result[i][k] + + * (sum_col[k] * A_OFFSET) + + * (sum_row[i] * B_OFFSET) + + * (K_OFFSET) + * + * This result is quantized down to uint8/int8 using the output stage.
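 + * (Purely as a numeric illustration of this stage: with RESULT_OFFSET=2, RESULT_MULTIPLIER=3 and RESULT_SHIFT=4, an accumulator value of 100 maps to ((100 + 2) * 3) >> 4 = 19 before clamping and casting.)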
The output stage computes the following operations: + * + * -# Add offset terms to final result + * -# Multiply each entry of result by result_mult_int + * -# Add bias to final result (if -DADD_BIAS is passed at compile time) + * -# Shift the int32 accumulator by result_shift + * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time) + * -# Clamp the resulting int32 values: + * - to the [0..255] range and cast to QASYMM8. + * - to the [-128..127] range and cast to QASYMM8_SIGNED. + * + * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT + * + * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time + * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE + * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND. + * These values can be used to implement "rectified linear unit" activation functions + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE + * + * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32 + * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr + * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes) + * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes) + * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor + * @param[in] sum_row_ptr (Optional) Pointer to the source tensor.
Supported data type: same as @p mm_result_ptr + * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes) + * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes) + * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor + * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr + * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes) + * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor + * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8/QASYMM8_SIGNED + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] result_multipliers_ptr (Optional) Pointer to the output multipliers vector for per-channel quantization. Supported data types: S32 + * @param[in] result_multipliers_stride_x (Optional) Stride of the output multipliers vector in X dimension (in bytes) + * @param[in] result_multipliers_step_x (Optional) output_multipliers_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first element in the output multipliers vector + * @param[in] result_shifts_ptr (Optional) Pointer to the output shifts vector for per-channel quantization. 
Supported data types: S32 + * @param[in] result_shifts_stride_x (Optional) Stride of the output shifts vector in X dimension (in bytes) + * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first element in the output shifts vector + */ +__kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm_result) +#if defined(A_OFFSET) + , + IMAGE_DECLARATION(sum_col) +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + IMAGE_DECLARATION(sum_row) +#endif // defined(B_OFFSET) + , +#if defined(ADD_BIAS) + VECTOR_DECLARATION(biases), +#endif // defined(ADD_BIAS) + TENSOR3D_DECLARATION(dst) +#if defined(PER_CHANNEL_QUANTIZATION) + , + VECTOR_DECLARATION(result_multipliers), + VECTOR_DECLARATION(result_shifts) +#endif // defined(PER_CHANNEL_QUANTIZATION) + ) +{ + const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); + const int y = get_global_id(1); + const int z = get_global_id(2); + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; + + // Compute offset contribution + VEC_INT offset_term_s32 = offset_contribution( + x, y, z +#if defined(A_OFFSET) + , + sum_col_ptr, + sum_col_stride_x, + sum_col_step_x, + sum_col_stride_y, + sum_col_step_y, + sum_col_offset_first_element_in_bytes +#endif // defined(A_OFFSET) +#if defined(B_OFFSET) + , + sum_row_ptr, + sum_row_stride_x, + sum_row_step_x, + sum_row_stride_y, + sum_row_step_y, + sum_row_offset_first_element_in_bytes +#endif // defined(B_OFFSET) +#if defined(ADD_BIAS) + , + biases_ptr, + biases_stride_x, + biases_step_x, + biases_offset_first_element_in_bytes +#endif // defined(ADD_BIAS) + ); + + __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z; + + VEC_INT in_s32 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr); + + // Add the offset terms to GEMM's result + in_s32 += offset_term_s32; + + // -------------- OUTPUT STAGE + + // Add the result offset + in_s32 += (VEC_INT)RESULT_OFFSET; + + // Multiply by result_mult_int and shift +#if defined(PER_CHANNEL_QUANTIZATION) + __global uchar *result_multipliers_addr = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int); + __global uchar *result_shifts_addr = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int); + VEC_INT result_multipliers_values = VLOAD(VEC_SIZE)(0, (__global int *)result_multipliers_addr); + VEC_INT result_shifts_values = VLOAD(VEC_SIZE)(0, (__global int *)result_shifts_addr); + + in_s32 *= result_multipliers_values; + in_s32 >>= result_shifts_values; +#else // defined(PER_CHANNEL_QUANTIZATION) + in_s32 *= RESULT_MULTIPLIER; + + in_s32 >>= RESULT_SHIFT; +#endif // defined(PER_CHANNEL_QUANTIZATION) + + VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE) + res0 = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)); + +#if defined(MIN_BOUND) + res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND); +#endif // defined(MIN_BOUND) +#if defined(MAX_BOUND) + res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND); +#endif // defined(MAX_BOUND) + + // Store the result + STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 &&
get_global_id(0) == 0) +} + +/* OpenCL kernel used to add the offset contribution after matrix multiplication and quantize the result down to uint8. + * + * This kernel takes a final int32 accumulator value (the output of matrix multiplication), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output stage. + * + * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200) + * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1) + * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6) + * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches + * + * The result before the output stage is: + * + * mm_result[i][k] = mm_result[i][k] + + * (sum_col[k] * A_OFFSET) + + * (sum_row[i] * B_OFFSET) + + * (K_OFFSET) + * + * This result is quantized down to uint8/int8 using the output stage. The output stage computes the following operations: + * + * -# Compute the fixed point multiplication of each entry of the input by result_fixedpoint_multiplier + * -# Add bias to final result if bias tensor is not a nullptr + * -# Round to nearest division by a power-of-two using result_shift + * -# Add offset to each result + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values: + * - to the [0..255] range and cast to QASYMM8. + * - to the [-128..127] range and cast to QASYMM8_SIGNED. + * + * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT + * + * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time + * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE + * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND. + * These values can be used to implement "rectified linear unit" activation functions + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE + * + * @param[in] mm_result_ptr Pointer to the source tensor.
Supported data type: S32
+ * @param[in]  mm_result_stride_x                               Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  mm_result_step_x                                 mm_result_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  mm_result_stride_y                               Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  mm_result_step_y                                 mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  mm_result_stride_z                               Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  mm_result_step_z                                 mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  mm_result_offset_first_element_in_bytes          The offset of the first element in the source tensor
+ * @param[in]  sum_col_ptr                                      (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in]  sum_col_stride_x                                 (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_col_step_x                                   (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_col_stride_y                                 (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_col_step_y                                   (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_col_offset_first_element_in_bytes            (Optional) The offset of the first element in the source tensor
+ * @param[in]  sum_row_ptr                                      (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in]  sum_row_stride_x                                 (Optional) Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  sum_row_step_x                                   (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_row_stride_y                                 (Optional) Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  sum_row_step_y                                   (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_row_offset_first_element_in_bytes            (Optional) The offset of the first element in the source tensor
+ * @param[in]  biases_ptr                                       (Optional) Pointer to the biases tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in]  biases_stride_x                                  (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                                    (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes             (Optional) The offset of the first element in the biases tensor
+ * @param[out] dst_ptr                                          Pointer to the destination tensor. Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  dst_stride_x                                     Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                                       dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                                     Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                                       dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                                     Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                                       dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes                The offset of the first element in the destination tensor
+ * @param[in]  result_multipliers_ptr                           (Optional) Pointer to the output multipliers vector for per-channel quantization. Supported data types: S32
+ * @param[in]  result_multipliers_stride_x                      (Optional) Stride of the output multipliers vector in X dimension (in bytes)
+ * @param[in]  result_multipliers_step_x                        (Optional) result_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first element in the output multipliers vector
+ * @param[in]  result_shifts_ptr                                (Optional) Pointer to the output shifts vector for per-channel quantization. Supported data types: S32
+ * @param[in]  result_shifts_stride_x                           (Optional) Stride of the output shifts vector in X dimension (in bytes)
+ * @param[in]  result_shifts_step_x                             (Optional) result_shifts_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  result_shifts_offset_first_element_in_bytes      (Optional) The offset of the first element in the output shifts vector
+ */
+__kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DECLARATION(mm_result)
+#if defined(A_OFFSET)
+                                                                    ,
+                                                                    IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+                                                                    ,
+                                                                    IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+                                                                    ,
+#if defined(ADD_BIAS)
+                                                                    VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+                                                                    TENSOR3D_DECLARATION(dst)
+#if defined(PER_CHANNEL_QUANTIZATION)
+                                                                    ,
+                                                                    VECTOR_DECLARATION(result_multipliers),
+                                                                    VECTOR_DECLARATION(result_shifts)
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+                                                                   )
+{
+    const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+
+    // Compute offset contribution
+    VEC_INT offset_term_s32 = offset_contribution(
+                                  x, y, z
+#if defined(A_OFFSET)
+                                  ,
+                                  sum_col_ptr,
+                                  sum_col_stride_x,
+                                  sum_col_step_x,
+                                  sum_col_stride_y,
+                                  sum_col_step_y,
+                                  sum_col_offset_first_element_in_bytes
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+                                  ,
+                                  sum_row_ptr,
+                                  sum_row_stride_x,
+                                  sum_row_step_x,
+                                  sum_row_stride_y,
+                                  sum_row_step_y,
+                                  sum_row_offset_first_element_in_bytes
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+                                  ,
+                                  biases_ptr,
+                                  biases_stride_x,
+                                  biases_step_x,
+                                  biases_offset_first_element_in_bytes
+#endif // defined(ADD_BIAS)
+                              );
+
+    __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+    VEC_INT in_s32 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr);
+
+    // Add the offset terms to GEMM's result
+    in_s32 += offset_term_s32;
+
+    // -------------- OUTPUT STAGE
+
+    // Multiply by result_mult_int and shift
+#if defined(PER_CHANNEL_QUANTIZATION)
+    __global uchar *result_multipliers_addr = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int);
+    __global uchar *result_shifts_addr      = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int);
+    VEC_INT result_multipliers_values       = VLOAD(VEC_SIZE)(0, (__global int *)result_multipliers_addr);
+    VEC_INT result_shifts_values            = VLOAD(VEC_SIZE)(0, (__global int *)result_shifts_addr);
+
+    VEC_INT in_s32_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, VEC_SIZE);
+    VEC_INT in_s32_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, VEC_SIZE);
+    in_s32                   = select(in_s32_shift_lt0, in_s32_shift_gt0, result_shifts_values >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+
+#if RESULT_SHIFT < 0
+    in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
+#else // RESULT_SHIFT >= 0
+    in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
+#endif // RESULT_SHIFT < 0
+
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+
+    // Add the offset terms to GEMM's result
+    in_s32 += (VEC_INT)RESULT_OFFSET;
+
+    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
+    res0 = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
+
+#if defined(MIN_BOUND)
+    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+    // Store the result
+    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif // defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT) && defined(OUTPUT_DATA_TYPE)
+
+#undef VEC_INT
+
+#endif // defined(K_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
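The ASYMM_MULT_BY_QUANT_MULTIPLIER_* macros used above implement gemmlowp-style fixed-point requantization. As a reading aid, here is a minimal scalar sketch in C of the two primitives they compose; the function names are illustrative, not part of the library:

    #include <stdint.h>

    /* Saturating rounding doubling high multiply by a Q31 multiplier. */
    static int32_t sat_rounding_doubling_high_mul(int32_t a, int32_t b)
    {
        if(a == INT32_MIN && b == INT32_MIN)
        {
            return INT32_MAX; /* the only overflowing case */
        }
        const int64_t ab    = (int64_t)a * (int64_t)b;
        const int32_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
        return (int32_t)((ab + nudge) >> 31);
    }

    /* Division by 2^exponent with round-to-nearest (ties away from zero). */
    static int32_t rounding_divide_by_pow2(int32_t x, int exponent)
    {
        const int32_t mask      = (1 << exponent) - 1;
        const int32_t remainder = x & mask;
        const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
        return (x >> exponent) + (remainder > threshold ? 1 : 0);
    }

The ..._LESS_THAN_ONE variant is the high multiply followed by the rounding divide; the ..._GREATER_THAN_ONE variant instead left-shifts the value by -shift before the multiply, which is why the kernel above selects between the two paths on the sign of the shift.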
+
+#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Add offset terms to final result
+ *  -# Multiply each entry of result by result_mult_int
+ *  -# Add bias to final result (if -DADD_BIAS is passed at compile time)
+ *  -# Shift the int32 accumulator by result_shift
+ *  -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
+ *  -# Clamp the resulting int32 values:
+ *     - to the [0..255] range and cast to QASYMM8.
+ *     - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ *       These values can be used to implement "rectified linear unit" activation functions
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in]  src_ptr                              Pointer to the source tensor. Supported data type: S32
+ * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[in]  biases_ptr                           (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in]  biases_stride_x                      (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                        (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
+ * @param[out] dst_ptr                              Pointer to the destination tensor. Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                         Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                           dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+                                                  VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+                                                  TENSOR3D_DECLARATION(dst))
+{
+    // Compute source and destination addresses
+    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+    int y = get_global_id(1);
+    int z = get_global_id(2);
+
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+    // Add bias
+    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
+    input_values += biases_values;
+#endif // defined(ADD_BIAS)
+
+    // Add the offset terms to GEMM's result
+    input_values += (VEC_DATA_TYPE(int, VEC_SIZE))RESULT_OFFSET;
+
+    // Multiply by result_mult_int and shift
+    input_values *= RESULT_MULT_INT;
+
+#if RESULT_SHIFT < 0
+    input_values >>= -RESULT_SHIFT;
+#else // RESULT_SHIFT >= 0
+    input_values >>= RESULT_SHIFT;
+#endif // RESULT_SHIFT < 0
+
+    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
+    res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
+
+#if defined(MIN_BOUND)
+    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+    // Store the result
+    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
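To make the per-element computation of this kernel concrete, here is a scalar sketch in C with hypothetical values for the compile-time constants (the macro values below are illustrative only):

    #include <stdint.h>

    /* Hypothetical stand-ins for the -D compile-time constants. */
    #define RESULT_OFFSET   2
    #define RESULT_MULT_INT 3
    #define RESULT_SHIFT    8

    static uint8_t quantize_down_scalar(int32_t acc, int32_t bias)
    {
        int32_t v = acc + bias;   /* -DADD_BIAS path              */
        v += RESULT_OFFSET;       /* add the quantization offset  */
        v *= RESULT_MULT_INT;     /* integer scale                */
        v >>= RESULT_SHIFT;       /* plain arithmetic shift       */
        if(v < 0)   v = 0;        /* CONVERT_SAT to QASYMM8       */
        if(v > 255) v = 255;
        return (uint8_t)v;
    }

For example, acc = 900 and bias = 12 give (912 + 2) * 3 = 2742, and 2742 >> 8 = 10, which is already inside the QASYMM8 range.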
+
+#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Round to nearest division by a power-of-two using result_shift
+ *  -# Add offset to each result
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting int32 values:
+ *     - to the [0..255] range and cast to QASYMM8.
+ *     - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ *       These values can be used to implement "rectified linear unit" activation functions
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in]  src_ptr                              Pointer to the source tensor. Supported data type: S32
+ * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[in]  biases_ptr                           (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in]  biases_stride_x                      (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                        (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
+ * @param[out] dst_ptr                              Pointer to the destination tensor. Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                         Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                           dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+                                                             VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+                                                             TENSOR3D_DECLARATION(dst))
+{
+    // Compute source and destination addresses
+    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+    int y = get_global_id(1);
+    int z = get_global_id(2);
+
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+    // Add bias
+    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
+    input_values += biases_values;
+#endif // defined(ADD_BIAS)
+
+    // Multiply by result_fixedpoint_multiplier and shift
+#if RESULT_SHIFT < 0
+    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
+#else // RESULT_SHIFT >= 0
+    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
+#endif // RESULT_SHIFT < 0
+
+    // Add the offset terms to GEMM's result
+    input_values += (VEC_DATA_TYPE(int, VEC_SIZE))RESULT_OFFSET_AFTER_SHIFT;
+
+    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
+    res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
+
+#if defined(MIN_BOUND)
+    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+    // Store the result
+    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
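Composed end to end, the fixed-point stage above amounts to the following scalar sketch, reusing the two helper functions sketched earlier (all names illustrative):

    /* Scalar composition of the fixed-point output stage, under the same
     * assumptions as the earlier helpers. */
    static uint8_t output_stage_fixedpoint_scalar(int32_t acc, int32_t bias,
                                                  int32_t multiplier, int shift,
                                                  int32_t offset_after_shift)
    {
        int32_t v = acc + bias;
        if(shift < 0)
        {
            /* multiplier > 1 case: left-shift first, then high multiply */
            v = sat_rounding_doubling_high_mul(v * (1 << -shift), multiplier);
        }
        else
        {
            /* multiplier < 1 case: high multiply, then rounding divide */
            v = rounding_divide_by_pow2(sat_rounding_doubling_high_mul(v, multiplier), shift);
        }
        v += offset_after_shift;
        if(v < 0)   v = 0;   /* saturate to the QASYMM8 range */
        if(v > 255) v = 255;
        return (uint8_t)v;
    }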
+
+#if defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
+
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QSYMM16 value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Round to nearest division by a power-of-two using result_shift
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting int32 values to the [-32768..32767] range and cast to QSYMM16.
+ *
+ * @attention The scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ *       These values can be used to implement "rectified linear unit" activation functions
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in]  src_ptr                              Pointer to the source tensor. Supported data type: S32
+ * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[in]  biases_ptr                           (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in]  biases_stride_x                      (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                        (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
+ * @param[out] dst_ptr                              Pointer to the destination tensor. Supported data type: QSYMM16
+ * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                         Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                           dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+                                                                     VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+                                                                     TENSOR3D_DECLARATION(dst))
+{
+    // Compute source and destination addresses
+    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+    int y = get_global_id(1);
+    int z = get_global_id(2);
+
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(short) + y * dst_stride_y + z * dst_stride_z;
+
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+    // Add bias
+    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
+    input_values += biases_values;
+#endif // defined(ADD_BIAS)
+
+    // Multiply by result_fixedpoint_multiplier and shift
+#if RESULT_SHIFT < 0
+    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
+#else // RESULT_SHIFT >= 0
+    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE);
+#endif // RESULT_SHIFT < 0
+
+    VEC_DATA_TYPE(short, VEC_SIZE)
+    res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(short, VEC_SIZE));
+
+#if defined(MIN_BOUND)
+    res0 = max(res0, (VEC_DATA_TYPE(short, VEC_SIZE))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+    res0 = min(res0, (VEC_DATA_TYPE(short, VEC_SIZE))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+    // Store the result
+    STORE_VECTOR_SELECT(res, short, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif // defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
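The QSYMM16 variant above differs from the QASYMM8 kernels only in having no output offset (the format is symmetric) and in its saturation range. A scalar sketch of the final cast:

    #include <stdint.h>

    /* CONVERT_SAT to the symmetric 16-bit range; -DMIN_BOUND / -DMAX_BOUND
     * may tighten it further. Name is illustrative. */
    static int16_t to_qsymm16_sat(int32_t requantized)
    {
        if(requantized < INT16_MIN) return INT16_MIN;
        if(requantized > INT16_MAX) return INT16_MAX;
        return (int16_t)requantized;
    }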
+
+#if defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Convert the int32 accumulator to float and multiply by the real multiplier
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Requantize
+ *  -# Add offset to each result
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting int32 values:
+ *     - to the [0..255] range and cast to QASYMM8.
+ *     - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset and scalar scale factor must be passed at compile time using -DOUTPUT_OFFSET and -DREAL_MULTIPLIER
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
+ *       These values can be used to implement "rectified linear unit" activation functions
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ *
+ * @param[in]  src_ptr                              Pointer to the source tensor. Supported data type: S32
+ * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
+ * @param[in]  biases_ptr                           (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr
+ * @param[in]  biases_stride_x                      (Optional) Stride of the biases tensor in X dimension (in bytes)
+ * @param[in]  biases_step_x                        (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor
+ * @param[out] dst_ptr                              Pointer to the destination tensor. Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                         Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                           dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                         Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                           dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+                                                        VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+#if defined(DST_HEIGHT)
+                                                        TENSOR4D_DECLARATION(dst))
+#else // defined(DST_HEIGHT)
+                                                        TENSOR3D_DECLARATION(dst))
+#endif // defined(DST_HEIGHT)
+{
+    // Compute source and destination addresses
+    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+    int y = get_global_id(1);
+    int z = get_global_id(2);
+
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
+
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+    // Add bias
+    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr);
+    input_values += (VEC_DATA_TYPE(int, VEC_SIZE))biases_values;
+#endif // defined(ADD_BIAS)
+
+    // Convert to float
+    VEC_DATA_TYPE(float, VEC_SIZE)
+    input_values_f = CONVERT(input_values, VEC_DATA_TYPE(float, VEC_SIZE));
+    input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET);
+
+    VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)
+    res0 = CONVERT_SAT(input_values_f, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE));
+
+#if defined(MIN_BOUND)
+    res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+    res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+    // Store the result
+    STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
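A scalar sketch of the float output stage above, with real_multiplier and output_offset standing in for the -DREAL_MULTIPLIER and -DOUTPUT_OFFSET compile-time constants (round-half-away-from-zero is assumed, matching OpenCL's round()):

    #include <math.h>
    #include <stdint.h>

    static uint8_t quantize_down_float_scalar(int32_t acc, float real_multiplier, float output_offset)
    {
        const float v = roundf((float)acc * real_multiplier + output_offset);
        if(v < 0.0f)   return 0;   /* CONVERT_SAT to QASYMM8 */
        if(v > 255.0f) return 255;
        return (uint8_t)v;
    }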
diff --git a/src/core/CL/cl_kernels/common/gemv.cl b/src/core/CL/cl_kernels/common/gemv.cl
new file mode 100644
index 0000000000..71a372eb29
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/gemv.cl
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT)
+/** This kernel applies a dot product to each plane of the input tensor and the corresponding column of the reshaped weight tensor.
+ *
+ * @note Datatype and source width and height should be given as a preprocessor argument using -DDATA_TYPE=type, -DSRC_WIDTH=width and -DSRC_HEIGHT=height. e.g. -DDATA_TYPE=float
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ */
+__kernel void gemm_mv(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(weights), VECTOR_DECLARATION(dst))
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+    int y = get_global_id(1) * 4;
+    int z = get_global_id(2);
+
+    __global uchar *current_weights = weights_ptr + weights_offset_first_element_in_bytes + z * weights_stride_y;
+    __global uchar *input_ptr       = src.ptr;
+
+    DATA_TYPE acc0 = (DATA_TYPE)0;
+    DATA_TYPE acc1 = (DATA_TYPE)0;
+    DATA_TYPE acc2 = (DATA_TYPE)0;
+    DATA_TYPE acc3 = (DATA_TYPE)0;
+
+    // This kernel handles 4 rows per thread so that it can reuse the weights
+    for(int i = 0; i < SRC_WIDTH; i += 4)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        weights = vload4(0, (__global DATA_TYPE *)(current_weights + i * weights_stride_x));
+
+        int4 offset = (int4)i * (int4)src_stride_x + (int4)(0, 1, 2, 3) * (int4)src_stride_y;
+
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        tmp0 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        tmp1 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        tmp2 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        tmp3 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s3));
+
+        acc0 += dot(weights, tmp0);
+        acc1 += dot(weights, tmp1);
+        acc2 += dot(weights, tmp2);
+        acc3 += dot(weights, tmp3);
+    }
+
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y + z * SRC_HEIGHT) * dst_stride_x;
+
+    int rows_left = SRC_HEIGHT - (y + 4);
+
+    // This check handles the last few rows when the number of rows is not divisible by four
+    if(rows_left >= 0)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        out = (VEC_DATA_TYPE(DATA_TYPE, 4))(acc0, acc1, acc2, acc3);
+        vstore4(out, 0, (__global DATA_TYPE *)output_ptr);
+    }
+    else
+    {
+        switch(rows_left)
+        {
+            case -1: // three rows left; one is padding
+                *((__global DATA_TYPE *)(output_ptr + 2 * dst_stride_x)) = acc2;
+            case -2: // two rows left; two are padding
+                *((__global DATA_TYPE *)(output_ptr + 1 * dst_stride_x)) = acc1;
+            case -3: // one row left; three are padding
+                *((__global DATA_TYPE *)(output_ptr + 0 * dst_stride_x)) = acc0;
+                break;
+        }
+    }
+}
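The tail logic above relies on a deliberate switch fall-through: rows_left = SRC_HEIGHT - (y + 4) is negative only for the final group of rows, and each case stores one more valid accumulator on its way down. A scalar sketch in C (names illustrative):

    #include <stddef.h>

    /* Stores only the accumulators that correspond to real rows when fewer
     * than four rows remain; stride_elems is the output stride in elements. */
    static void store_tail(float *out, ptrdiff_t stride_elems,
                           float acc0, float acc1, float acc2, int rows_left)
    {
        switch(rows_left)
        {
            case -1:
                out[2 * stride_elems] = acc2; /* fall through */
            case -2:
                out[1 * stride_elems] = acc1; /* fall through */
            case -3:
                out[0 * stride_elems] = acc0;
                break;
        }
    }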
+
+/** This kernel applies a dot product to each plane of the input tensor and the corresponding column of the reshaped weight tensor.
+ *
+ * @note Input data type should be given as a preprocessor argument using -DDATA_TYPE=type, e.g. -DDATA_TYPE=uchar
+ *
+ * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                          Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source tensor
+ * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: S32
+ * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
+ * @param[in]  input_offset                          Input's quantization offset
+ * @param[in]  weights_offset                        Weights' quantization offset
+ */
+__kernel void gemm_mv_quantized(TENSOR3D_DECLARATION(src),
+                                IMAGE_DECLARATION(weights),
+                                VECTOR_DECLARATION(dst),
+                                const int input_offset,
+                                const int weights_offset)
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+
+    int y = get_global_id(1) * 4;
+    int z = get_global_id(2);
+
+    __global uchar *current_weights = weights_ptr + weights_offset_first_element_in_bytes + z * weights_stride_y;
+    __global uchar *input_ptr       = src.ptr;
+
+    int acc0 = 0;
+    int acc1 = 0;
+    int acc2 = 0;
+    int acc3 = 0;
+
+    // This kernel handles 4 rows per thread so that it can reuse the weights
+    for(int i = 0; i < SRC_WIDTH; i += 4)
+    {
+        int4 w = convert_int4(vload4(0, (__global DATA_TYPE *)(current_weights + i * weights_stride_x))) + (int4)weights_offset;
+
+        int4 offset = (int4)i * (int4)src_stride_x + (int4)(0, 1, 2, 3) * (int4)src_stride_y;
+
+        int4 tmp0 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s0))) + (int4)input_offset;
+        int4 tmp1 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s1))) + (int4)input_offset;
+        int4 tmp2 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s2))) + (int4)input_offset;
+        int4 tmp3 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s3))) + (int4)input_offset;
+
+        // Accumulate
+        acc0 += tmp0.s0 * w.s0 + tmp0.s1 * w.s1 + tmp0.s2 * w.s2 + tmp0.s3 * w.s3;
+        acc1 += tmp1.s0 * w.s0 + tmp1.s1 * w.s1 + tmp1.s2 * w.s2 + tmp1.s3 * w.s3;
+        acc2 += tmp2.s0 * w.s0 + tmp2.s1 * w.s1 + tmp2.s2 * w.s2 + tmp2.s3 * w.s3;
+        acc3 += tmp3.s0 * w.s0 + tmp3.s1 * w.s1 + tmp3.s2 * w.s2 + tmp3.s3 * w.s3;
+    }
+
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y + z * SRC_HEIGHT) * dst_stride_x;
+
+    int rows_left = SRC_HEIGHT - (y + 4);
+
+    // This check handles the last few rows when the number of rows is not divisible by four
+    if(rows_left >= 0)
+    {
+        vstore4((int4)(acc0, acc1, acc2, acc3), 0, (__global int *)output_ptr);
+    }
+    else
+    {
+        switch(rows_left)
+        {
+            case -1: // three rows left; one is padding
+                *((__global int *)(output_ptr + 2 * dst_stride_x)) = acc2;
+            case -2: // two rows left; two are padding
+                *((__global int *)(output_ptr + 1 * dst_stride_x)) = acc1;
+            case -3: // one row left; three are padding
+                *((__global int *)(output_ptr + 0 * dst_stride_x)) = acc0;
+                break;
+        }
+    }
+}
+#endif /* defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) */
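In the quantized kernel above, both operands are shifted by their quantization offsets before the integer multiply, so the S32 accumulator approximates the real-valued dot product up to the two scale factors. A scalar sketch of one accumulation (names illustrative):

    #include <stdint.h>

    static int32_t accumulate_quantized(const uint8_t *x, const uint8_t *w, int n,
                                        int32_t input_offset, int32_t weights_offset)
    {
        int32_t acc = 0;
        for(int i = 0; i < n; ++i)
        {
            /* Offsets applied first, mirroring the kernel's convert+add step */
            acc += ((int32_t)x[i] + input_offset) * ((int32_t)w[i] + weights_offset);
        }
        return acc;
    }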
diff --git a/src/core/CL/cl_kernels/common/generate_proposals.cl b/src/core/CL/cl_kernels/common/generate_proposals.cl
new file mode 100644
index 0000000000..5b8502072a
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/generate_proposals.cl
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Generate all the regions of interest based on the image size and the anchors passed in. For each element (x,y) of the
+ * grid, it will generate NUM_ANCHORS rois, given by shifting the grid position to match the anchor.
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE= Tensor data type. Supported data types: F16/F32
+ * -# -DHEIGHT= Height of the feature map on which this kernel is applied
+ * -# -DWIDTH= Width of the feature map on which this kernel is applied
+ * -# -DNUM_ANCHORS= Number of anchors to be used to generate the rois for each pixel
+ * -# -DSTRIDE= Stride to be applied at each different pixel position (i.e., x_range = (1:WIDTH)*STRIDE and y_range = (1:HEIGHT)*STRIDE)
+ * -# -DNUM_ROI_FIELDS= Number of fields used to represent a roi
+ *
+ * @param[in]  anchors_ptr                           Pointer to the anchors tensor. Supported data types: F16/F32
+ * @param[in]  anchors_stride_x                      Stride of the anchors tensor in X dimension (in bytes)
+ * @param[in]  anchors_step_x                        anchors_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  anchors_stride_y                      Stride of the anchors tensor in Y dimension (in bytes)
+ * @param[in]  anchors_step_y                        anchors_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  anchors_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  anchors_step_z                        anchors_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  anchors_offset_first_element_in_bytes The offset of the first element in the anchors tensor
+ * @param[out] rois_ptr                              Pointer to the rois. Supported data types: same as @p anchors_ptr
+ * @param[in]  rois_stride_x                         Stride of the rois in X dimension (in bytes)
+ * @param[in]  rois_step_x                           rois_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rois_stride_y                         Stride of the rois in Y dimension (in bytes)
+ * @param[in]  rois_step_y                           rois_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rois_stride_z                         Stride of the rois in Z dimension (in bytes)
+ * @param[in]  rois_step_z                           rois_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  rois_offset_first_element_in_bytes    The offset of the first element in the rois
+ */
+#if defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(NUM_ANCHORS) && defined(STRIDE) && defined(NUM_ROI_FIELDS)
+__kernel void generate_proposals_compute_all_anchors(
+    VECTOR_DECLARATION(anchors),
+    VECTOR_DECLARATION(rois))
+{
+    Vector anchors = CONVERT_TO_VECTOR_STRUCT_NO_STEP(anchors);
+    Vector rois    = CONVERT_TO_VECTOR_STRUCT(rois);
+
+    const size_t idx = get_global_id(0);
+    // Find the index of the anchor
+    const size_t anchor_idx = idx % NUM_ANCHORS;
+
+    // Find which shift this thread is using
+    const size_t shift_idx = idx / NUM_ANCHORS;
+
+    // Compute the shift on the X and Y direction (the shift depends exclusively on the thread index)
+    const DATA_TYPE
+    shift_x = (DATA_TYPE)(shift_idx % WIDTH) * STRIDE;
+    const DATA_TYPE
+    shift_y = (DATA_TYPE)(shift_idx / WIDTH) * STRIDE;
+
+    const VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS)
+    shift = (VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS))(shift_x, shift_y, shift_x, shift_y);
+
+    // Read the given anchor
+    const VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS)
+    anchor = vload4(0, (__global DATA_TYPE *)vector_offset(&anchors, anchor_idx * NUM_ROI_FIELDS));
+
+    // Apply the shift to the anchor
+    const VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS)
+    shifted_anchor = anchor + shift;
+
+    vstore4(shifted_anchor, 0, (__global DATA_TYPE *)rois.ptr);
+}
+#endif //defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(NUM_ANCHORS) && defined(STRIDE) && defined(NUM_ROI_FIELDS)
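The kernel above maps one work-item to one roi: the anchor index varies fastest, then the x shift, then the y shift. A scalar sketch of that index decomposition in C (names illustrative):

    #include <stddef.h>

    static void decode_roi_index(size_t idx, size_t num_anchors, size_t width, float stride,
                                 size_t *anchor_idx, float *shift_x, float *shift_y)
    {
        *anchor_idx           = idx % num_anchors;   /* which anchor at this grid cell */
        const size_t shift_id = idx / num_anchors;   /* which grid cell                */
        *shift_x              = (float)(shift_id % width) * stride;
        *shift_y              = (float)(shift_id / width) * stride;
    }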
diff --git a/src/core/CL/cl_kernels/common/generate_proposals_quantized.cl b/src/core/CL/cl_kernels/common/generate_proposals_quantized.cl
new file mode 100644
index 0000000000..70f861c4b7
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/generate_proposals_quantized.cl
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers_asymm.h"
+
+/** Generate all the regions of interest based on the image size and the anchors passed in. For each element (x,y) of the
+ * grid, it will generate NUM_ANCHORS rois, given by shifting the grid position to match the anchor.
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE= Tensor data type. Supported data types: QASYMM8
+ * -# -DHEIGHT= Height of the feature map on which this kernel is applied
+ * -# -DWIDTH= Width of the feature map on which this kernel is applied
+ * -# -DNUM_ANCHORS= Number of anchors to be used to generate the rois for each pixel
+ * -# -DSTRIDE= Stride to be applied at each different pixel position (i.e., x_range = (1:WIDTH)*STRIDE and y_range = (1:HEIGHT)*STRIDE)
+ * -# -DNUM_ROI_FIELDS= Number of fields used to represent a roi
+ *
+ * @param[in]  anchors_ptr                           Pointer to the anchors tensor. Supported data types: QASYMM8
+ * @param[in]  anchors_stride_x                      Stride of the anchors tensor in X dimension (in bytes)
+ * @param[in]  anchors_step_x                        anchors_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  anchors_stride_y                      Stride of the anchors tensor in Y dimension (in bytes)
+ * @param[in]  anchors_step_y                        anchors_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  anchors_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  anchors_step_z                        anchors_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  anchors_offset_first_element_in_bytes The offset of the first element in the anchors tensor
+ * @param[out] rois_ptr                              Pointer to the rois. Supported data types: same as @p anchors_ptr
+ * @param[in]  rois_stride_x                         Stride of the rois in X dimension (in bytes)
+ * @param[in]  rois_step_x                           rois_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  rois_stride_y                         Stride of the rois in Y dimension (in bytes)
+ * @param[in]  rois_step_y                           rois_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  rois_stride_z                         Stride of the rois in Z dimension (in bytes)
+ * @param[in]  rois_step_z                           rois_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  rois_offset_first_element_in_bytes    The offset of the first element in the rois
+ */
+#if defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(NUM_ANCHORS) && defined(STRIDE) && defined(NUM_ROI_FIELDS) && defined(OFFSET) && defined(SCALE)
+__kernel void generate_proposals_compute_all_anchors_quantized(
+    VECTOR_DECLARATION(anchors),
+    VECTOR_DECLARATION(rois))
+{
+    Vector anchors = CONVERT_TO_VECTOR_STRUCT_NO_STEP(anchors);
+    Vector rois    = CONVERT_TO_VECTOR_STRUCT(rois);
+
+    const size_t idx = get_global_id(0);
+    // Find the index of the anchor
+    const size_t anchor_idx = idx % NUM_ANCHORS;
+
+    // Find which shift this thread is using
+    const size_t shift_idx = idx / NUM_ANCHORS;
+
+    // Compute the shift on the X and Y direction (the shift depends exclusively on the thread index)
+    const float shift_x = (float)(shift_idx % WIDTH) * STRIDE;
+    const float shift_y = (float)(shift_idx / WIDTH) * STRIDE;
+
+    VEC_DATA_TYPE(float, NUM_ROI_FIELDS)
+    shift = (VEC_DATA_TYPE(float, NUM_ROI_FIELDS))(shift_x, shift_y, shift_x, shift_y);
+
+    // Read the given anchor
+    VEC_DATA_TYPE(float, NUM_ROI_FIELDS)
+    anchor = DEQUANTIZE(VLOAD(NUM_ROI_FIELDS)(0, (__global DATA_TYPE *)vector_offset(&anchors, anchor_idx * NUM_ROI_FIELDS)), OFFSET, SCALE, DATA_TYPE, NUM_ROI_FIELDS);
+
+    // Apply the shift to the anchor
+    VEC_DATA_TYPE(float, NUM_ROI_FIELDS)
+    shifted_anchor = anchor + shift;
+
+    VSTORE(NUM_ROI_FIELDS)
+    (QUANTIZE(shifted_anchor, OFFSET, SCALE, DATA_TYPE, NUM_ROI_FIELDS), 0, (__global DATA_TYPE *)rois.ptr);
+}
+#endif //defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(NUM_ANCHORS) && defined(STRIDE) && defined(NUM_ROI_FIELDS) && defined(OFFSET) && defined(SCALE)
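The quantized variant above works in the real domain by dequantizing the anchor, applying the float shift, then requantizing. A scalar sketch of that round trip; the exact rounding of the library's QUANTIZE macro may differ, round-to-nearest is assumed here:

    #include <math.h>
    #include <stdint.h>

    static uint8_t shift_quantized_anchor(uint8_t q, float shift, int offset, float scale)
    {
        const float real    = scale * (float)((int)q - offset);   /* DEQUANTIZE */
        const long  requant = lroundf((real + shift) / scale) + offset;
        if(requant < 0)   return 0;    /* saturate to QASYMM8 */
        if(requant > 255) return 255;
        return (uint8_t)requant;
    }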
diff --git a/src/core/CL/cl_kernels/common/instance_normalization.cl b/src/core/CL/cl_kernels/common/instance_normalization.cl
new file mode 100644
index 0000000000..adfbebd67d
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/instance_normalization.cl
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z)
+/** This function computes the mean and variance of each plane of the input tensor and provides it as output.
+ *
+ * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g. -DDATA_TYPE=float
+ * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value, -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7
+ *
+ * @param[in]  input_ptr                            Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in]  input_stride_x                       Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void compute_mean_var(
+    TENSOR4D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+
+#if defined(NHWC)
+    const int          ch             = get_global_id(0); // Current channel
+    const int          batch          = get_global_id(1); // Current batch
+    const int          elements_plane = DIM_Y * DIM_Z;
+    INTERNAL_DATA_TYPE part_sum       = 0.f;
+    INTERNAL_DATA_TYPE part_sum_sq    = 0.f;
+    const int          in_offset      = input_offset_first_element_in_bytes + batch * input_stride_w + ch * sizeof(DATA_TYPE);
+
+    for(int i_w = 0; i_w < DIM_Y; ++i_w)
+    {
+        for(int i_h = 0; i_h < DIM_Z; ++i_h)
+        {
+            INTERNAL_DATA_TYPE data = (INTERNAL_DATA_TYPE) * ((__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch));
+            part_sum += data;
+            part_sum_sq += data * data;
+        }
+    }
+
+    INTERNAL_DATA_TYPE mean              = (part_sum / elements_plane);
+    INTERNAL_DATA_TYPE var               = (part_sum_sq / elements_plane) - (mean * mean);
+    __global INTERNAL_DATA_TYPE *output_address0 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 0, batch);
+    *output_address0                     = mean;
+    __global INTERNAL_DATA_TYPE *output_address1 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 1, batch);
+    *output_address1                     = var;
+#else // !defined(NHWC)
+    const int ch             = get_global_id(2) % DIM_Z; // Current channel
+    const int batch          = get_global_id(2) / DIM_Z; // Current batch
+    const int elements_plane = DIM_X * DIM_Y;
+
+    VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
+    part_sum = 0.f;
+    VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
+    part_sum_sq = 0.f;
+    // Calculate partial sum
+    for(int y = 0; y < DIM_Y; ++y)
+    {
+        int x = 0;
+        for(; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE)
+        {
+            // Load data
+            VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)
+            data = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch)), VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE));
+            part_sum += data;
+            part_sum_sq += data * data;
+        }
+        // Left-overs loop
+        for(; x < DIM_X; ++x)
+        {
+            INTERNAL_DATA_TYPE data = (INTERNAL_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch)));
+            part_sum.s0 += data;
+            part_sum_sq.s0 += data * data;
+        }
+    }
+    // Perform reduction
+#if VEC_SIZE > 8
+    part_sum.s01234567 += part_sum.s89abcdef;
+    part_sum_sq.s01234567 += part_sum_sq.s89abcdef;
+#endif // VEC_SIZE > 8
+#if VEC_SIZE > 4
+    part_sum.s0123 += part_sum.s4567;
+    part_sum_sq.s0123 += part_sum_sq.s4567;
+#endif // VEC_SIZE > 4
+#if VEC_SIZE > 2
+    part_sum.s01 += part_sum.s23;
+    part_sum_sq.s01 += part_sum_sq.s23;
+#endif // VEC_SIZE > 2
+    part_sum.s0 += part_sum.s1;
+    part_sum_sq.s0 += part_sum_sq.s1;
+
+    INTERNAL_DATA_TYPE sum    = (INTERNAL_DATA_TYPE)part_sum.s0;
+    INTERNAL_DATA_TYPE sum_sq = (INTERNAL_DATA_TYPE)part_sum_sq.s0;
+
+    const INTERNAL_DATA_TYPE mean = (sum / elements_plane);
+    const INTERNAL_DATA_TYPE var  = (sum_sq / elements_plane) - (mean * mean);
+
+    __global INTERNAL_DATA_TYPE *output_address0 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 0, batch);
+    *output_address0                             = mean;
+    __global INTERNAL_DATA_TYPE *output_address1 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 1, batch);
+    *output_address1                             = var;
+
+#endif // defined(NHWC)
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z) */
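The reduction above relies on the one-pass identity Var[x] = E[x^2] - (E[x])^2, accumulating the sum and the sum of squares in a single sweep over the plane. A scalar sketch (names illustrative):

    /* Mean and variance of one plane of n elements, single pass. */
    static void mean_var_scalar(const float *plane, int n, float *mean, float *var)
    {
        float sum = 0.0f, sum_sq = 0.0f;
        for(int i = 0; i < n; ++i)
        {
            sum    += plane[i];
            sum_sq += plane[i] * plane[i];
        }
        *mean = sum / (float)n;
        *var  = sum_sq / (float)n - (*mean) * (*mean);
    }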
Supported data types: same as @p input_ptr + * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor + */ +__kernel void instance_normalization( + TENSOR4D_DECLARATION(input), + TENSOR3D_DECLARATION(mean_var) +#ifndef IN_PLACE + , + TENSOR4D_DECLARATION(output) +#endif /* IN_PLACE */ +) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); + Tensor3D mean_var = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(mean_var); +#ifndef IN_PLACE + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); +#endif /* IN_PLACE */ + +#if defined(NHWC) + const int ch = get_global_id(0); // Current channel + const int batch = get_global_id(2); // Current batch +#else /* defined(NHWC) */ + const int ch = get_global_id(2) % DIM_Z; // Current channel + const int batch = get_global_id(2) / DIM_Z; // Current batch +#endif /* defined(NHWC) */ + + const __global INTERNAL_DATA_TYPE *mean_ptr = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&mean_var, ch, 0, batch); + const __global INTERNAL_DATA_TYPE *var_ptr = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&mean_var, ch, 1, batch); + const INTERNAL_DATA_TYPE mean = (INTERNAL_DATA_TYPE) * mean_ptr; + const INTERNAL_DATA_TYPE var = (INTERNAL_DATA_TYPE) * var_ptr; + const INTERNAL_DATA_TYPE multip = GAMMA / sqrt(var + EPSILON); + const INTERNAL_DATA_TYPE beta = (INTERNAL_DATA_TYPE)BETA; + +#if defined(NHWC) + const int in_offset = input_offset_first_element_in_bytes + batch * input_stride_w + ch * sizeof(DATA_TYPE); +#ifndef IN_PLACE + const int out_offset = output_offset_first_element_in_bytes + batch * input_stride_w + ch * sizeof(DATA_TYPE); +#endif /* IN_PLACE */ + + for(int i_w = 0; i_w < DIM_Y; ++i_w) + { + for(int i_h = 0; i_h < DIM_Z; ++i_h) + { + __global DATA_TYPE *input_address = (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch); +#ifdef IN_PLACE + __global DATA_TYPE *output_address = input_address; +#else /* !IN_PLACE */ + __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch); +#endif /* IN_PLACE */ + *(output_address) = (*(input_address) - mean) * multip + (INTERNAL_DATA_TYPE)BETA; + } + } +#else // !defined(NHWC) + for(int y = 0; y < DIM_Y; ++y) + { + int x = 0; + for(; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE) + { + __global DATA_TYPE *input_address = (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); +#ifdef IN_PLACE + __global DATA_TYPE *output_address = input_address; +#else /* !IN_PLACE */ + __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); +#endif /* IN_PLACE */ + + VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE) + data = CONVERT(VLOAD(VEC_SIZE)(0, input_address), VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)); + + VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE) + res = (data - mean) * multip + (INTERNAL_DATA_TYPE)BETA; + 
VSTORE(VEC_SIZE) + (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, output_address); + } + // Left-overs loop + for(; x < DIM_X; ++x) + { + __global DATA_TYPE *input_address = (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); +#ifdef IN_PLACE + __global DATA_TYPE *output_address = input_address; +#else /* !IN_PLACE */ + __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); +#endif /* IN_PLACE */ + *(output_address) = (*(input_address) - mean) * multip + (INTERNAL_DATA_TYPE)BETA; + } + } +#endif // defined(NHWC) +} +#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(GAMMA) && defined(BETA) && defined(EPSILON) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z) */ diff --git a/src/core/CL/cl_kernels/common/l2_normalize.cl b/src/core/CL/cl_kernels/common/l2_normalize.cl new file mode 100644 index 0000000000..fbe3406239 --- /dev/null +++ b/src/core/CL/cl_kernels/common/l2_normalize.cl @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X) +/** This kernel performs l2 normalization on the x-axis. + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE_X=size. e.g. -DVEC_SIZE_X=16 + * @note The leftover size in the X dimension should be given as a preprocessor argument using -DVEC_SIZE_LEFTOVER_X, i.e. x_dimension % VEC_SIZE_X. e.g. -DVEC_SIZE_LEFTOVER_X=1 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] sum_ptr Pointer to the source tensor.
Supported data types: F16/F32 + * @param[in] sum_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] sum_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] epsilon Epsilon value + */ +__kernel void l2_normalize_x( + IMAGE_DECLARATION(input), + IMAGE_DECLARATION(sum), + IMAGE_DECLARATION(output), + DATA_TYPE epsilon) +{ + // Offset computation + const uint x_offs = max((int)(get_global_id(0) * VEC_SIZE_X - (VEC_SIZE_X - VEC_SIZE_LEFTOVER_X) % VEC_SIZE_X), 0); + + // Address computation + __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y; + __global uchar *sum_addr = sum_ptr + sum_offset_first_element_in_bytes + get_global_id(1) * sum_stride_y; + __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * output_stride_y; + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) + in = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)input_addr); + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) + normalize_value = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X))rsqrt(fmax(*((__global DATA_TYPE *)sum_addr), epsilon)); + + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) + data0 = in * normalize_value; + + STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE_X, VEC_SIZE_LEFTOVER_X, VEC_SIZE_LEFTOVER_X != 0 && get_global_id(0) == 0); +} + +/** This kernel performs l2 normalization on the y-axis. + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE_X=size. e.g. -DVEC_SIZE_X=16 + * @note The leftover size in the X dimension should be given as a preprocessor argument using -DVEC_SIZE_LEFTOVER_X, i.e. x_dimension % VEC_SIZE_X. e.g. -DVEC_SIZE_LEFTOVER_X=1 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] sum_ptr Pointer to the source tensor.
Supported data types: F16/F32 + * @param[in] sum_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] sum_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] epsilon Epsilon value + */ +__kernel void l2_normalize_y( + IMAGE_DECLARATION(input), + IMAGE_DECLARATION(sum), + IMAGE_DECLARATION(output), + DATA_TYPE epsilon) +{ + // Offset computation + const uint x_offs = max((int)(get_global_id(0) * VEC_SIZE_X - (VEC_SIZE_X - VEC_SIZE_LEFTOVER_X) % VEC_SIZE_X), 0); + + // Address computation + __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y; + __global uchar *sum_addr = sum_ptr + sum_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE); + __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * output_stride_y; + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) + in = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)input_addr); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) + sums = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)sum_addr); + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) + normalize_value = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X))rsqrt(fmax(sums, epsilon)); + + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) + data0 = in * normalize_value; + + STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE_X, VEC_SIZE_LEFTOVER_X, VEC_SIZE_LEFTOVER_X != 0 && get_global_id(0) == 0); +} + +/** This kernel performs l2 normalization on the z-axis. + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE_X=size. e.g. -DVEC_SIZE_X=16 + * @note The leftover size in the X dimension should be given as a preprocessor argument using -DVEC_SIZE_LEFTOVER_X, i.e. x_dimension % VEC_SIZE_X. e.g. -DVEC_SIZE_LEFTOVER_X=1 + * + * @param[in] input_ptr Pointer to the source tensor.
Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] sum_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] sum_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] sum_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] sum_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] epsilon Epsilon value + */ +__kernel void l2_normalize_z( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(sum), + TENSOR3D_DECLARATION(output), + DATA_TYPE epsilon) +{ + // Offset computation + const uint x_offs = max((int)(get_global_id(0) * VEC_SIZE_X - (VEC_SIZE_X - VEC_SIZE_LEFTOVER_X) % VEC_SIZE_X), 0); + + // Address computation + __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z; + __global uchar *sum_addr = sum_ptr + sum_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * sum_stride_y; + __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z; + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) + in = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)input_addr); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) + sums = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)sum_addr); + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) + data0 = in * ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X))(rsqrt(fmax(sums, epsilon)))); + + STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE_X, VEC_SIZE_LEFTOVER_X, VEC_SIZE_LEFTOVER_X != 0 &&
get_global_id(0) == 0); +} +#endif // defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/common/mean_stddev_normalization.cl b/src/core/CL/cl_kernels/common/mean_stddev_normalization.cl new file mode 100644 index 0000000000..05727a6aa6 --- /dev/null +++ b/src/core/CL/cl_kernels/common/mean_stddev_normalization.cl @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(WIDTH) +/** This function normalizes the input 2D tensor across the first dimension with respect to mean and standard deviation of the same dimension. + * + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention Data type should be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Width of the input tensor should be passed using the -DWIDTH compile flag, e.g. -DWIDTH=16 + * @attention Normalization epsilon parameter should be given as a preprocessor argument with -DEPSILON=value. e.g. -DEPSILON=0.001f + * + * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] output_ptr (Optional) Pointer to the destination tensor. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor + */ +__kernel void mean_stddev_normalization( + IMAGE_DECLARATION(input) +#ifndef IN_PLACE + , + IMAGE_DECLARATION(output) +#endif /* IN_PLACE */ +) +{ + // Get pixels pointer + Image in = CONVERT_TO_IMAGE_STRUCT(input); +#ifdef IN_PLACE + Image out = in; +#else /* IN_PLACE */ + Image out = CONVERT_TO_IMAGE_STRUCT(output); +#endif /* IN_PLACE */ + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + sum = 0.f; + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + sum_sq = 0.f; + // Calculate partial sum + int i = 0; + for(; i <= (WIDTH - VEC_SIZE); i += VEC_SIZE) + { + // Load data + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)offset(&in, i, 0)); + + sum += data; + sum_sq += data * data; + } + // Perform reduction + sum = SUM_REDUCE(sum, VEC_SIZE); + sum_sq = SUM_REDUCE(sum_sq, VEC_SIZE); + +#if VEC_SIZE > 1 +#define sum sum.s0 +#define sum_sq sum_sq.s0 +#endif // VEC_SIZE > 1 + + // Left-overs loop + for(; i < WIDTH; ++i) + { + DATA_TYPE data = *((__global DATA_TYPE *)offset(&in, i, 0)); + + sum += data; + sum_sq += data * data; + } + + DATA_TYPE mean = sum / WIDTH; + DATA_TYPE var = (sum_sq / WIDTH) - (mean * mean); + DATA_TYPE stddev_inv = 1.f / sqrt(var + EPSILON); + + i = 0; + for(; i <= (WIDTH - VEC_SIZE); i += VEC_SIZE) + { + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)offset(&in, i, 0)); + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + res = (data - mean) * stddev_inv; + VSTORE(VEC_SIZE) + (res, 0, (__global DATA_TYPE *)offset(&out, i, 0)); + } + for(; i < WIDTH; ++i) + { + DATA_TYPE data = *((__global DATA_TYPE *)offset(&in, i, 0)); + + *((__global DATA_TYPE *)offset(&out, i, 0)) = (data - mean) * stddev_inv; + } +} +#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(WIDTH) */ diff --git a/src/core/CL/cl_kernels/common/memset.cl b/src/core/CL/cl_kernels/common/memset.cl new file mode 100644 index 0000000000..9ff25f3af4 --- /dev/null +++ b/src/core/CL/cl_kernels/common/memset.cl @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(CONSTANT_VALUE) // Check for compile time constants + +/** Fill the tensor's planes with the given value + * @attention The following variables must be passed at compile time: + * -# -DDATA_TYPE = Tensor data type. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 + * -# -DCONSTANT_VALUE = The value used to fill the tensor's planes + * -# -DVEC_SIZE = Vector size + * -# -DLAST_ACCESSED_X = The element that is on the X border (threads trying to set this, might need to step back a bit) + * + * @param[in] tensor_ptr Pointer to the source image. Data types supported: All. + * @param[in] tensor_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] tensor_step_x tensor_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] tensor_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] tensor_step_y tensor_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] tensor_offset_first_element_in_bytes The offset of the first element in the source image + */ +__kernel void memset( + TENSOR3D_DECLARATION(tensor)) +{ + Tensor3D tensor = CONVERT_TO_TENSOR3D_STRUCT(tensor); + +#if defined(VEC_SIZE) + +#if defined(LAST_ACCESSED_X) + // Check if access on width gets out of bounds + // If it does shift access vector to access elements within bounds + const int xi = (int)(get_global_id(0) * VEC_SIZE); + tensor.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * tensor_stride_x; +#endif // defined(LAST_ACCESSED_X) + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + data = (DATA_TYPE)(CONSTANT_VALUE); + + VSTORE(VEC_SIZE) + (data, 0, (__global DATA_TYPE *)tensor.ptr); +#else // !defined(VEC_SIZE) + *((__global DATA_TYPE *)(tensor.ptr)) = (DATA_TYPE)(CONSTANT_VALUE); +#endif // defined(VEC_SIZE) +} + +#endif // Check for compile time constants diff --git a/src/core/CL/cl_kernels/common/minmax_layer.cl b/src/core/CL/cl_kernels/common/minmax_layer.cl new file mode 100644 index 0000000000..49356451df --- /dev/null +++ b/src/core/CL/cl_kernels/common/minmax_layer.cl @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(WIDTH) && defined(HEIGHT) && defined(DEPTH) +/** This function identifies the minimum and maximum values of an input 3D tensor. + * + * @note The width, height and depth of the input tensor must be provided at compile time using -DWIDTH, -DHEIGHT and -DDEPTH (e.g. -DWIDTH=320, -DHEIGHT=240, -DDEPTH=3) + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source image in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the min/max vector. Minimum value in position 0, maximum value in position 1. Supported data types: F32. + * @param[in] dst_stride_x Stride of the min/max vector in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the min/max vector + */ +__kernel void minmax_layer( + TENSOR3D_DECLARATION(src), + VECTOR_DECLARATION(dst)) +{ + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Vector dst = CONVERT_TO_VECTOR_STRUCT(dst); + + float4 min_value = (float4)FLT_MAX; + float4 max_value = (float4) - FLT_MAX; + float2 min_max_value = (float2)(FLT_MAX, -FLT_MAX); + + for(int z = 0; z < DEPTH; ++z) + { + for(int y = 0; y < HEIGHT; ++y) + { + int x = 0; + __global float *src_addr = (__global float *)(src.ptr + y * src_stride_y + z * src_stride_z); + + for(; x <= (int)(WIDTH - 8); x += 8) + { + float8 value = vload8(0, src_addr + x); + + min_value = select(value.s0123, min_value, min_value < value.s0123); + min_value = select(value.s4567, min_value, min_value < value.s4567); + + max_value = select(value.s0123, max_value, max_value > value.s0123); + max_value = select(value.s4567, max_value, max_value > value.s4567); + } + + for(; x < WIDTH; ++x) + { + float value = *(src_addr + x); + + min_max_value.s0 = min(min_max_value.s0, value); + min_max_value.s1 = max(min_max_value.s1, value); + } + } + } + + // Perform min/max reduction + min_value.s01 = min(min_value.s01, min_value.s23); + min_value.s0 = min(min_value.s0, min_value.s1); + max_value.s01 = max(max_value.s01, max_value.s23); + max_value.s0 = max(max_value.s0, max_value.s1); + + min_max_value.s0 = min(min_max_value.s0, min_value.s0); + min_max_value.s1 = max(min_max_value.s1, max_value.s0); + + if(min_max_value.s0 == min_max_value.s1) + { + min_max_value.s0 = 0.0f; + min_max_value.s1 = 1.0f; + } + + // Store min and max + vstore2(min_max_value, 0, (__global float *)dst.ptr); +} +#endif // defined(WIDTH) && defined(HEIGHT) && defined(DEPTH) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/common/nonmax.cl
b/src/core/CL/cl_kernels/common/nonmax.cl new file mode 100644 index 0000000000..702e635a89 --- /dev/null +++ b/src/core/CL/cl_kernels/common/nonmax.cl @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** This function performs non-maxima suppression over a 3x3 window on a given image. + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * + * @param[in] src_ptr Pointer to the source image. Supported data types: U8/F32 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image.
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void non_max_suppression( + IMAGE_DECLARATION(src), + IMAGE_DECLARATION(dst)) +{ + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Image dst = CONVERT_TO_IMAGE_STRUCT(dst); + + VEC_DATA_TYPE(DATA_TYPE, 8) + vc = vload8(0, (__global DATA_TYPE *)src.ptr); + + if(all(vc == (DATA_TYPE)0)) + { + vstore8(0, 0, (__global DATA_TYPE *)dst.ptr); + + return; + } + + VEC_DATA_TYPE(DATA_TYPE, 16) + nc = vload16(0, (__global DATA_TYPE *)offset(&src, -1, -1)); + VEC_DATA_TYPE(DATA_TYPE, 8) + out = select((DATA_TYPE)0, vc, (vc >= nc.s01234567) && (vc >= nc.s12345678) && (vc >= nc.s23456789)); + + nc = vload16(0, (__global DATA_TYPE *)offset(&src, -1, 0)); + out = select((DATA_TYPE)0, out, (vc >= nc.s01234567) && (vc > nc.s23456789)); + + nc = vload16(0, (__global DATA_TYPE *)offset(&src, -1, +1)); + out = select((DATA_TYPE)0, out, (vc > nc.s01234567) && (vc > nc.s12345678) && (vc > nc.s23456789)); + + vstore8(out, 0, (__global DATA_TYPE *)dst.ptr); +} diff --git a/src/core/CL/cl_kernels/common/pad_layer.cl b/src/core/CL/cl_kernels/common/pad_layer.cl new file mode 100644 index 0000000000..5ae4ec884d --- /dev/null +++ b/src/core/CL/cl_kernels/common/pad_layer.cl @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && defined(SRC_WIDTH) && defined(PAD_X_BEFORE_REMAINDER) && defined(VEC_SIZE_LEFTOVER_WRITE) + +#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) +#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) +#define VEC_SELECT SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) +#define OFFSETS VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VEC_SIZE) +#define SCALAR_COND(x) CONVERT((VEC_SELECT)x == (VEC_SELECT)1, VEC_SELECT) + +#if defined(CONST_VAL) && defined(VEC_SIZE_LEFTOVER_READ) +/** Perform a pad operation when PaddingMode is CONSTANT + * + * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @note Vector size must be passed using the -DVEC_SIZE compile flag, e.g. -DVEC_SIZE=4 + * @note Constant value used to fill the pads must be passed using the -DCONST_VAL compile flag, e.g. -DCONST_VAL=1.27 + * @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g. -DPAD_X_BEFORE=5 + * @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g. -DSRC_WIDTH=224 + * @note In case pad left is more than the vector size, the number of threads to skip along the X axis must be passed using the + * -DTHREADS_TO_SKIP_BEFORE compile flag, e.g. -DTHREADS_TO_SKIP_BEFORE=1. This is defined as (PAD_X_BEFORE / VEC_SIZE) + * @note In case pad left is more than the vector size, the thread from which to skip along the X axis for pad right must be passed using the + * -DTHREADS_TO_SKIP_AFTER compile flag, e.g. -DTHREADS_TO_SKIP_AFTER=1. This is defined as ((SRC_WIDTH + PAD_X_BEFORE) / VEC_SIZE) + * @note If pad also needs to be added to the top of the tensor, the following compile flags must be passed at compile time: + * -# -DPAD_Y_BEFORE: Pad to add to the top of the input tensor (e.g. -DPAD_Y_BEFORE=3) + * -# -DSRC_HEIGHT: Input tensor's height (e.g. -DSRC_HEIGHT=127) + * @note If pad also needs to be added to the depth of the tensor, the following compile flags must be passed at compile time: + * -# -DPAD_Z_BEFORE: Pad to add before the first plane of the input tensor (e.g. -DPAD_Z_BEFORE=3) + * -# -DSRC_DEPTH: Input tensor's depth (e.g. -DSRC_DEPTH=32) + * @note If pad also needs to be added to the batch of the tensor, the following compile flags must be passed at compile time: + * -# -DPAD_W_BEFORE: Pad to add before the first batch of the input tensor (e.g. -DPAD_W_BEFORE=3) + * -# -DSRC_BATCH: Input tensor's batch size (e.g. -DSRC_BATCH=4) + * + * @param[in] src_ptr Pointer to the source image. Supported data types: All + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source image in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image.
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination image in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] batch (Optional) Batch index if 4D pad must be applied + */ +__kernel void pad_layer_constant(TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst) +#if defined(PAD_W_BEFORE) + , + uint batch +#endif // defined(PAD_W_BEFORE) + ) +{ + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + + // If true, write only padding values; no reads performed + uint cond = 0; +#if defined(THREADS_TO_SKIP_BEFORE) + cond |= x < THREADS_TO_SKIP_BEFORE || x > THREADS_TO_SKIP_AFTER; +#endif // defined(THREADS_TO_SKIP_BEFORE) +#if defined(PAD_Y_BEFORE) + cond |= y < PAD_Y_BEFORE || y >= (SRC_HEIGHT + PAD_Y_BEFORE); +#endif // defined(PAD_Y_BEFORE) +#if defined(PAD_Z_BEFORE) + cond |= z < PAD_Z_BEFORE || z >= (SRC_DEPTH + PAD_Z_BEFORE); +#endif // defined(PAD_Z_BEFORE) +#if defined(PAD_W_BEFORE) + cond |= batch < PAD_W_BEFORE || batch >= (SRC_BATCH + PAD_W_BEFORE); +#endif // defined(PAD_W_BEFORE) + + if(cond) + { + VEC_TYPE const_vals0 = (VEC_TYPE)CONST_VAL; + STORE_VECTOR_SELECT(const_vals, DATA_TYPE, dst.ptr, VEC_SIZE, VEC_SIZE_LEFTOVER_WRITE, get_global_id(0) == (get_global_size(0) - 1)); + } + else + { + // Calculate input's coordinates based on output's + int w = 0; +#if defined(THREADS_TO_SKIP_BEFORE) + x -= THREADS_TO_SKIP_BEFORE; +#endif // defined(THREADS_TO_SKIP_BEFORE) +#if defined(PAD_Y_BEFORE) + y -= PAD_Y_BEFORE; +#endif // defined(PAD_Y_BEFORE) +#if defined(PAD_Z_BEFORE) + z -= PAD_Z_BEFORE; +#endif // defined(PAD_Z_BEFORE) +#if defined(PAD_W_BEFORE) + w -= PAD_W_BEFORE * SRC_DEPTH; +#endif // defined(PAD_W_BEFORE) + x *= VEC_SIZE; + x -= PAD_X_BEFORE_REMAINDER; + + // Check for out of bound reads and clamp X coordinate + uint cond_left = x < 0; + uint cond_right = (x + VEC_SIZE) > SRC_WIDTH; + x = clamp(x, 0, (SRC_WIDTH - VEC_SIZE)); + + // Calculate input's address + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * src_stride_x + y * src_stride_y + z * src_stride_z + w * (int)src_stride_z; + + // Read values and rotate them properly if they would have been across paddings + VEC_TYPE src_vals0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr); + src_vals0 = select(src_vals0, ROTATE(src_vals0, VEC_SIZE, PAD_X_BEFORE_REMAINDER), SCALAR_COND(cond_left)); + src_vals0 = select(src_vals0, ROTATE(src_vals0, VEC_SIZE, VEC_SIZE_LEFTOVER_READ), SCALAR_COND(cond_right)); + + // Check what values would be padding and replace them with the constant value + VEC_INT xs_out = (VEC_INT)(get_global_id(0) * VEC_SIZE) + VEC_OFFS(int, VEC_SIZE); + VEC_INT conds = xs_out < (VEC_INT)PAD_X_BEFORE || xs_out >= (VEC_INT)(SRC_WIDTH + PAD_X_BEFORE); + src_vals0 = select(src_vals0, (VEC_TYPE)CONST_VAL, CONVERT(conds, VEC_SELECT)); + + // Store values in bounds + STORE_VECTOR_SELECT(src_vals, 
DATA_TYPE, dst.ptr, VEC_SIZE, VEC_SIZE_LEFTOVER_WRITE, get_global_id(0) == (get_global_size(0) - 1)); + } +} +#endif // defined(CONST_VAL) && defined(VEC_SIZE_LEFTOVER_READ) + +#if defined(IS_REFLECT) && defined(PAD_X_AFTER_REMAINDER) && defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) && defined(AFTER_PAD_FACT_X) + +#define ROTATE_REVERSE(x, n) ROTATE(REVERSE(x, VEC_SIZE), VEC_SIZE, n) +#define SYMM_REFL_LEFT(x, n0, n1) select(ROTATE_REVERSE(x, n1), ROTATE(x, VEC_SIZE, n0), OFFSETS >= (VEC_SELECT)n0) +#define SYMM_REFL_RIGHT(x, n0, n1) select(ROTATE(x, VEC_SIZE, n0), ROTATE_REVERSE(x, n1), OFFSETS >= (VEC_SELECT)n0) + +/** Perform a pad operation when PaddingMode is SYMMETRIC or REFLECT + * + * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @note Vector size must be passed using the -DVEC_SIZE compile flag, e.g. -DVEC_SIZE=4 + * @note Constant value must be passed using the -DCONST_VAL compile flag, e.g. -DCONST_VAL=1.27 + * @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g. -DPAD_X_BEFORE=5 + * @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g. -DSRC_WIDTH=224 + * @note Number of values to the left when operating across left padding must be passed using the -DPAD_X_BEFORE_REMAINDER compile flag, e.g. -DPAD_X_BEFORE_REMAINDER=5 + * @note Number of values to the left when operating across right padding must be passed using the -DPAD_X_AFTER_REMAINDER compile flag, e.g. -DPAD_X_AFTER_REMAINDER=6 + * @note To rearrange the vectors properly, (PAD_X_BEFORE_REMAINDER + 1) must be passed when mode is REFLECT using the -DPAD_X_BEFORE_REMAINDER_REFL compile flag, e.g. -DPAD_X_BEFORE_REMAINDER_REFL=6 + * @note To rearrange the vectors properly, (PAD_X_AFTER_REMAINDER - 1) must be passed using the -DPAD_X_AFTER_REMAINDER_REFL compile flag, e.g. -DPAD_X_AFTER_REMAINDER_REFL=5 + * @note The starting point to read backward from, once past the right pad in X, must be passed using the -DAFTER_PAD_FACT_X compile flag, e.g. -DAFTER_PAD_FACT_X=253 + * @note If padding mode is REFLECT, the -DIS_REFLECT compile flag must be set to 1, else it must be set to 0 + * @note If pad also needs to be added to the top of the tensor, the following compile flags must be passed at compile time: + * -# -DPAD_Y_BEFORE: Pad to add to the top of the input tensor (e.g. -DPAD_Y_BEFORE=3) + * -# -DSRC_HEIGHT: Input tensor's height (e.g. -DSRC_HEIGHT=127) + * @note If pad also needs to be added to the depth of the tensor, the following compile flags must be passed at compile time: + * -# -DPAD_Z_BEFORE: Pad to add before the first plane of the input tensor (e.g. -DPAD_Z_BEFORE=3) + * -# -DSRC_DEPTH: Input tensor's depth (e.g. -DSRC_DEPTH=32) + * @note If the starting point to read backward from is less than the output's last element accessed in the X, the following compile flags must be passed at compile time to avoid negative offsets: + * -# -DAFTER_PAD_REM: Defines how much to rotate the vector if the backward calculation attempted to read from a negative offset (e.g. -DAFTER_PAD_REM=3) + * + * @param[in] src_ptr Pointer to the source image.
Supported data types: All + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source image in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination image in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void pad_layer_symmetric_reflect(TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + // Get current thread position + const int x = get_global_id(0); + const int y = get_global_id(1); + const int z = get_global_id(2); + + // Define conditions based on the thread X position w.r.t. pad left and right + const int x_out_first = x * VEC_SIZE; + const int x_out_last = x_out_first + VEC_SIZE; + const int is_before_pad_left = (x_out_last <= PAD_X_BEFORE); + const int is_across_pad_left = (x_out_first < PAD_X_BEFORE) && (x_out_last > PAD_X_BEFORE); + const int is_inside_input = (x_out_first >= PAD_X_BEFORE) && (x_out_last <= (SRC_WIDTH + PAD_X_BEFORE)); + const int is_across_pad_right = (x_out_first < (SRC_WIDTH + PAD_X_BEFORE)) && (x_out_last > (SRC_WIDTH + PAD_X_BEFORE)); + const int is_after_pad_right = (x_out_first >= (SRC_WIDTH + PAD_X_BEFORE)); + + // Calculate base pointers + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes; + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + // Calculate input tensor's offset based on the defined conditions + int x_offset = 0; + x_offset = select(x_offset, PAD_X_BEFORE - x_out_last + IS_REFLECT, is_before_pad_left); + x_offset = select(x_offset, x_out_first - PAD_X_BEFORE, is_inside_input); + x_offset = select(x_offset, SRC_WIDTH - VEC_SIZE, is_across_pad_right); + x_offset = select(x_offset, AFTER_PAD_FACT_X - x_out_last, is_after_pad_right); + +#if defined(AFTER_PAD_REM) + int neg_offs = x_offset < 0; + x_offset = max(x_offset, 0); +#endif // defined(AFTER_PAD_REM) + + // Load input values from the computed offset + int y_in = y; + int z_in = z; +#if defined(PAD_Y_BEFORE) + y_in = select(y - PAD_Y_BEFORE, PAD_Y_BEFORE - y + IS_REFLECT - 1, y < PAD_Y_BEFORE); + y_in = select(y_in, 2 * SRC_HEIGHT + PAD_Y_BEFORE - y - IS_REFLECT - 1, y >= (SRC_HEIGHT + PAD_Y_BEFORE)); +#endif // defined(PAD_Y_BEFORE) +#if defined(PAD_Z_BEFORE) + z_in = select(z - PAD_Z_BEFORE, PAD_Z_BEFORE - z + IS_REFLECT - 1, z < PAD_Z_BEFORE); + z_in = select(z_in, 2 * SRC_DEPTH + PAD_Z_BEFORE - z - IS_REFLECT - 1, z >= (SRC_DEPTH 
+ PAD_Z_BEFORE)); +#endif // defined(PAD_Z_BEFORE) + + src_addr += x_offset * src_stride_x + y_in * src_step_y + z_in * src_step_z; + +#if SRC_WIDTH == 1 + VSTORE(VEC_SIZE) + ((VEC_TYPE)(*(__global DATA_TYPE *)src_addr), 0, (__global DATA_TYPE *)dst.ptr); +#else // SRC_WIDTH == 1 + + VEC_TYPE src_vals0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr); + + // Choose rearrangement policy based on the defined conditions + src_vals0 = select(src_vals0, SYMM_REFL_LEFT(src_vals0, PAD_X_BEFORE_REMAINDER, PAD_X_BEFORE_REMAINDER_REFL), SCALAR_COND(is_across_pad_left)); + src_vals0 = select(src_vals0, SYMM_REFL_RIGHT(src_vals0, PAD_X_AFTER_REMAINDER, PAD_X_AFTER_REMAINDER_REFL), SCALAR_COND(is_across_pad_right)); + src_vals0 = select(src_vals0, REVERSE(src_vals0, VEC_SIZE), SCALAR_COND((is_before_pad_left || is_after_pad_right))); +#if defined(AFTER_PAD_REM) + src_vals0 = select(src_vals0, ROTATE(src_vals0, VEC_SIZE, AFTER_PAD_REM), SCALAR_COND(neg_offs)); +#endif // defined(AFTER_PAD_REM) + + // Store values in bounds + STORE_VECTOR_SELECT(src_vals, DATA_TYPE, dst.ptr, VEC_SIZE, VEC_SIZE_LEFTOVER_WRITE, get_global_id(0) == (get_global_size(0) - 1)); +#endif // SRC_WIDTH == 1 +} +#endif // defined(IS_REFLECT) && defined(PAD_X_AFTER_REMAINDER) && defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) && defined(AFTER_PAD_FACT_X) +#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && defined(SRC_WIDTH) && defined(PAD_X_BEFORE_REMAINDER) && defined(VEC_SIZE_LEFTOVER_WRITE) diff --git a/src/core/CL/cl_kernels/common/permute.cl b/src/core/CL/cl_kernels/common/permute.cl new file mode 100644 index 0000000000..a03eeb1a19 --- /dev/null +++ b/src/core/CL/cl_kernels/common/permute.cl @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4) +/** Perform a permute operation on an input tensor of shape DCHW. + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16 + * @attention Permutation vector is passed as a preprocessor argument using the integer flags -DP1, -DP2, -DP3 and -DP4, e.g. -DP1=2, -DP2=1, -DP3=0 and -DP4=3.
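+ * For example (an illustrative permutation, not one mandated by the library): the kernel below reads in_index = { W, H, C, B } from the work item's global IDs, and with -DP1=2, -DP2=0, -DP3=1 and -DP4=3 each output coordinate becomes out_index[0] = in_index[P1], out_index[1] = in_index[P2], and so on, giving out_index = { C, W, H, B } here.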
 + * + * @param[in] input_ptr Pointer to the source image. Supported data types: All + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void permute(TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); + + int out_index[4] = { 0 }; + int in_index[4] = { 0 }; + + in_index[0] = get_global_id(0); // W + in_index[1] = get_global_id(1); // H + in_index[2] = get_global_id(2) % DEPTH_IN; // C + in_index[3] = get_global_id(2) / DEPTH_IN; // B + + out_index[0] = in_index[P1]; + out_index[1] = in_index[P2]; + out_index[2] = in_index[P3]; + out_index[3] = in_index[P4]; + + *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2], out_index[3])) = *((__global DATA_TYPE *)in.ptr); +} +#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4) diff --git a/src/core/CL/cl_kernels/common/pixelwise_mul_float.cl b/src/core/CL/cl_kernels/common/pixelwise_mul_float.cl new file mode 100644 index 0000000000..10875293a9 --- /dev/null +++ b/src/core/CL/cl_kernels/common/pixelwise_mul_float.cl @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
 + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#ifdef SATURATE +#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x)) +#else /* SATURATE */ +#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x)) +#endif /* SATURATE */ +#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round) + +#if defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT) + +#if defined(ACTIVATION_TYPE) +#include "activation_float_helpers.h" +#endif // defined(ACTIVATION_TYPE) + +#define VEC_ACC_TYPE VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE_OUT) +#define VEC_OUT_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT) +#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE_OUT) + +/** Performs a pixelwise multiplication with float scale of either integer or float inputs. + * + * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: + * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short + * @attention The data type of the intermediate result of the multiplication should be passed as well using -DACC_DATA_TYPE. + * e.g. if one of the inputs is S16, -DACC_DATA_TYPE=int should be passed, otherwise -DACC_DATA_TYPE=short. + * @attention -DDATA_TYPE_FLOAT must be passed if floating point inputs are provided. + * + * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32 + * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes) + * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32 + * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes) + * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image.
Supported data types: U8, S16, F16, F32 + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes) + * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] scale Float scaling factor. Supported data types: F32 + */ +__kernel void pixelwise_mul_float( + TENSOR3D_DECLARATION(in1), + TENSOR3D_DECLARATION(in2), +#if !defined(IN_PLACE) + TENSOR3D_DECLARATION(out), +#endif // !defined(IN_PLACE) + const float scale) +{ + // Get pixels pointer + size_t x = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0); + size_t y = get_global_id(1); + size_t z = get_global_id(2); + + __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z; + __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z; + __global uchar * +#if !defined(IN_PLACE) + out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z; +#else // !defined(IN_PLACE) +#if defined(SRC1_IN_PLACE) + out_addr = in1_addr; +#else //defined(SRC1_IN_PLACE) + out_addr = in2_addr; +#endif //defined(SRC1_IN_PLACE) +#endif // !defined(IN_PLACE) + + // Load data + VEC_ACC_TYPE in1_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN1, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_IN1 *)in1_addr)), VEC_ACC_TYPE); + VEC_ACC_TYPE in2_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN2, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_IN2 *)in2_addr)), VEC_ACC_TYPE); + + // Perform multiplication +#ifdef DATA_TYPE_FLOAT + VEC_OUT_TYPE res0 = CONVERT(in1_data * in2_data * (ACC_DATA_TYPE)scale, VEC_OUT_TYPE); +#else /* DATA_TYPE_FLOAT */ + VEC_OUT_TYPE res0 = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((CONVERT(in1_data * in2_data, VEC_FLOAT) * scale), VEC_ACC_TYPE, ROUND), VEC_OUT_TYPE, ROUND); +#endif /* DATA_TYPE_FLOAT */ + +#if defined(ACTIVATION_TYPE) + res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE_OUT, VEC_SIZE_OUT, res0, A_VAL, B_VAL); +#endif // defined(ACTIVATION_TYPE) + + STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); +} +#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT) */ + +#if defined(DATA_TYPE) + +/** Performs a pixelwise multiplication of complex float values + * + * @param[in] in1_ptr Pointer to the source image.
Supported data types: F16/F32 + * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in1_stride_z Stride of the source image in Y dimension (in bytes) + * @param[in] in1_step_z in1_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] in2_ptr Pointer to the source image. Supported data types: same as @p in1_ptr + * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_stride_z Stride of the source image in Y dimension (in bytes) + * @param[in] in2_step_z in2_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in1_ptr + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_stride_z Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_z out_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void pixelwise_mul_complex( + TENSOR3D_DECLARATION(in1), + TENSOR3D_DECLARATION(in2), + TENSOR3D_DECLARATION(out)) +{ + // Get pixels pointer + Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); + Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); + + // Load data + VEC_DATA_TYPE(DATA_TYPE, 2) + vin1 = vload2(0, (__global DATA_TYPE *)in1.ptr); + VEC_DATA_TYPE(DATA_TYPE, 2) + vin2 = vload2(0, (__global DATA_TYPE *)in2.ptr); + + // Perform complex multiplication + VEC_DATA_TYPE(DATA_TYPE, 2) + res = { vin1.x *vin2.x - vin1.y * vin2.y, vin1.x *vin2.y + vin2.x * vin1.y }; + +#if defined(ACTIVATION_TYPE) + vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE_OUT, res, A_VAL, B_VAL), 0, (__global DATA_TYPE *)out.ptr); +#else // defined(ACTIVATION_TYPE) + // Store result + vstore2(res, 0, (__global DATA_TYPE *)out.ptr); +#endif // defined(ACTIVATION_TYPE) +} + +#endif // defined(DATA_TYPE) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/common/pixelwise_mul_int.cl b/src/core/CL/cl_kernels/common/pixelwise_mul_int.cl new file mode 100644 index 0000000000..6d1c2d0c79 --- /dev/null +++ b/src/core/CL/cl_kernels/common/pixelwise_mul_int.cl @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(SATURATE)
+#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size##_sat(x))
+#else // SATURATE
+#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size(x))
+#endif // SATURATE
+#define CONVERT_OP_INT(x, type, size) CONVERT_OP_INT_STR(x, type, size)
+
+#define MUL_OP(x, y, scale, type, size) CONVERT_OP_INT((x) * (y) >> scale, type, size)
+
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+
+#if defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT)
+
+#define VEC_ACC_TYPE VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE_OUT)
+#define VEC_OUT_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT)
+
+/** Performs a pixelwise multiplication with integer scale of integer inputs.
+ *
+ * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
+ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short
+ * @attention The data type of the intermediate result of the multiplication should be passed as well using -DACC_DATA_TYPE.
+ * e.g. If one of inputs is S16 -DACC_DATA_TYPE=int should be passed else -DACC_DATA_TYPE=short.
+ *
+ * @param[in] in1_ptr Pointer to the source image. Supported data types: U8/S16
+ * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] in2_ptr Pointer to the source image.
Supported data types: same as @p in1_ptr + * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_stride_z Stride of the source image in Y dimension (in bytes) + * @param[in] in2_step_z in2_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in1_ptr + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_stride_z Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_z out_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] scale Integer scaling factor. Supported data types: S32. + */ +__kernel void pixelwise_mul_int( + TENSOR3D_DECLARATION(in1), + TENSOR3D_DECLARATION(in2), +#if !defined(IN_PLACE) + TENSOR3D_DECLARATION(out), +#endif // !defined(IN_PLACE) + const uint scale) +{ + size_t x = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0); + size_t y = get_global_id(1); + size_t z = get_global_id(2); + + __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z; + __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z; + __global uchar * +#if !defined(IN_PLACE) + out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z; +#else // !defined(IN_PLACE) +#if defined(SRC1_IN_PLACE) + out_addr = in1_addr; +#else //defined(SRC1_IN_PLACE) + out_addr = in2_addr; +#endif //defined(SRC1_IN_PLACE) +#endif // !defined(IN_PLACE) + + // Load data + VEC_ACC_TYPE in1_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN1, VEC_SIZE_OUT))VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_IN1 *)in1_addr), VEC_ACC_TYPE); + VEC_ACC_TYPE in2_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN2, VEC_SIZE_OUT))VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_IN2 *)in2_addr), VEC_ACC_TYPE); + // Perform multiplication and store result + VEC_OUT_TYPE out_data0 = MUL_OP(in1_data, in2_data, scale, DATA_TYPE_OUT, VEC_SIZE_OUT); + STORE_VECTOR_SELECT(out_data, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); +} +#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT) */ + +#if defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE_OUT) + +#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE_OUT) +#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE_OUT) +#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT) + +/** Performs a 
pixelwise multiplication with float scale of quantized inputs. + * + * @note The quantization offset of the first operand must be passed at compile time only if asymmetric using -DOFFSET_IN1, e.g. -DOFFSET_IN1=10 + * @note The quantization offset of the second operand must be passed at compile time only if asymmetric using -DOFFSET_IN2, e.g. -DOFFSET_IN2=10 + * @note The quantization offset of the output must be passed at compile time only if asymmetric using -DOFFSET_OUT, e.g. -DOFFSET_OUT=10 + * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, e.g. -DSCALE_IN1=10 + * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, e.g. -DSCALE_IN2=10 + * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, e.g. -DSCALE_OUT=10 + * @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used. + * @attention The data type must be passed at compile time using -DDATA_TYPE_OUT, i.e. -DDATA_TYPE_OUT=uchar + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * + * @param[in] in1_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM16 + * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in1_stride_z Stride of the source image in Y dimension (in bytes) + * @param[in] in1_step_z in1_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] in2_ptr Pointer to the source image. Supported data types: same as @p in1_ptr + * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_stride_z Stride of the source image in Y dimension (in bytes) + * @param[in] in2_step_z in2_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in1_ptr + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_stride_z Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_z out_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] scale Float scaling factor. 
Supported data types: F32 + */ +__kernel void pixelwise_mul_quantized( + TENSOR3D_DECLARATION(in1), + TENSOR3D_DECLARATION(in2), +#if !defined(IN_PLACE) + TENSOR3D_DECLARATION(out), +#endif // !defined(IN_PLACE) + const float scale) +{ + size_t x = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0); + size_t y = get_global_id(1); + size_t z = get_global_id(2); + + __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z; + __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z; + __global uchar * +#if !defined(IN_PLACE) + out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z; +#else // !defined(IN_PLACE) +#if defined(SRC1_IN_PLACE) + out_addr = in1_addr; +#else //defined(SRC1_IN_PLACE) + out_addr = in2_addr; +#endif //defined(SRC1_IN_PLACE) +#endif // !defined(IN_PLACE) + + // Load data + VEC_INT in_a = CONVERT((VEC_TYPE)(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_OUT *)in1_addr)), VEC_INT); + VEC_INT in_b = CONVERT((VEC_TYPE)(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_OUT *)in2_addr)), VEC_INT); + + // Dequantize +#if defined(OFFSET_IN1) + in_a -= (VEC_INT)((int)OFFSET_IN1); +#endif // defined(OFFSET_IN1) +#if defined(OFFSET_IN2) + in_b -= (VEC_INT)((int)OFFSET_IN2); +#endif // defined(OFFSET_IN2) + const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1); + const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2); + +#if defined(OFFSET_OUT) + const VEC_FLOAT qresf32 = (in1f32 * in2f32 * scale) / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFFSET_OUT)); +#else // defined(OFFSET_OUT) + const VEC_FLOAT qresf32 = (in1f32 * in2f32 * scale) / ((VEC_FLOAT)(float)SCALE_OUT); +#endif // defined(OFFSET_OUT) + const VEC_TYPE res0 = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_TYPE); + + // Store result + STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); +} +#endif /* defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE_OUT) */ diff --git a/src/core/CL/cl_kernels/common/pooling_layer.cl b/src/core/CL/cl_kernels/common/pooling_layer.cl new file mode 100644 index 0000000000..5122f2c251 --- /dev/null +++ b/src/core/CL/cl_kernels/common/pooling_layer.cl @@ -0,0 +1,390 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" +#include "repeat.h" +#include "tile_helpers.h" + +#if defined(POOL_AVG) || defined(POOL_L2) +#define POOL_OP(x, y) ((x) + (y)) +#else /* defined(POOL_AVG) || defined(POOL_L2) */ +#define POOL_OP(x, y) (fmax((x), (y))) +#endif /* defined(POOL_AVG) || defined(POOL_L2) */ + +#if defined(POOL_L2) +#define POW2_OP(x, vec_size) ((x) * (x)) +#else /* defined(POOL_L2) */ +#define POW2_OP(x, vec_size) (x) +#endif /* defined(POOL_L2) */ + +#define DIV_OP(x, y) (x * (1.f / y)) +#define SQRT_OP(x) sqrt((x)) + +#if STRIDE_X == 1 +#define POOLING3x3(res, input, output) POOLING3x3_STRIDE1(res, input, output) +#elif STRIDE_X == 2 /* STRIDE_X == 1 */ +#define POOLING3x3(res, input, output) POOLING3x3_STRIDE2(res, input, output) +#elif STRIDE_X == 3 /* STRIDE_X not equals 1 or 2 */ +#define POOLING3x3(res, input, output) POOLING3x3_STRIDE3(res, input, output) +#endif /* STRIDE_X == 3 */ + +#if defined(FP_MIXED_PRECISION) +#define CONVERT_TO_ACC_DATA_TYPE(x, n) CONVERT(x, VEC_DATA_TYPE(ACC_DATA_TYPE, n)) +#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) \ + CONVERT_TO_ACC_DATA_TYPE(vload##n(offset, ptr), n) +#else /* defined(FP_MIXED_PRECISION) */ +#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) vload##n(offset, ptr) +#endif /* defined(FP_MIXED_PRECISION) */ + +#define POOLING3x3_STRIDE1(res, input, output) \ + ({ \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ + data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 2) \ + data01 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 4); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ + data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 2) \ + data11 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 4); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ + data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 2) \ + data21 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 4); \ + data00 = POW2_OP(data00, 4); \ + data01 = POW2_OP(data01, 2); \ + data10 = POW2_OP(data10, 4); \ + data11 = POW2_OP(data11, 2); \ + data20 = POW2_OP(data20, 4); \ + data21 = POW2_OP(data21, 2); \ + \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ + values00 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data00.s01212323); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ + values01 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data01.s0, data00.s3, data01.s01); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ + values10 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data10.s01212323); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ + values11 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data11.s0, data10.s3, data11.s01); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ + values20 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data20.s01212323); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ + values21 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data21.s0, data20.s3, data21.s01); \ + \ + values00 = POOL_OP(values00, values10); \ + values01 = POOL_OP(values01, values11); \ + values00 = POOL_OP(values00, values20); \ + 
values01 = POOL_OP(values01, values21); \ + \ + res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s147, values01.s2)); \ + res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s25, values01.s03)); \ + }) + +#define POOLING3x3_STRIDE2(res, input, output) \ + ({ \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ + data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \ + ACC_DATA_TYPE data01 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8)); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ + data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \ + ACC_DATA_TYPE data11 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8)); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ + data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \ + ACC_DATA_TYPE data21 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8)); \ + data00 = POW2_OP(data00, 8); \ + data01 = POW2_OP(data01, 1); \ + data10 = POW2_OP(data10, 8); \ + data11 = POW2_OP(data11, 1); \ + data20 = POW2_OP(data20, 8); \ + data21 = POW2_OP(data21, 1); \ + \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ + values00 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data00.s01223445); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ + values01 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s667, data01); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ + values10 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data10.s01223445); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ + values11 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data10.s667, data11); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ + values20 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data20.s01223445); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ + values21 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data20.s667, data21); \ + \ + values00 = POOL_OP(values00, values10); \ + values01 = POOL_OP(values01, values11); \ + values00 = POOL_OP(values00, values20); \ + values01 = POOL_OP(values01, values21); \ + \ + res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s147, values01.s2)); \ + res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s25, values01.s03)); \ + }) + +#define POOLING3x3_STRIDE3(res, input, output) \ + ({ \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ + data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ + data01 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ + data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ + data11 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ + data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ + data21 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8); \ + data00 = POW2_OP(data00, 8); \ + data01 = POW2_OP(data01, 4); \ + data10 = POW2_OP(data10, 8); \ + data11 = POW2_OP(data11, 4); \ + data20 = POW2_OP(data20, 8); \ + data21 = POW2_OP(data21, 4); \ + \ + data00 = POOL_OP(data00, data10); \ + 
data01 = POOL_OP(data01, data11); \
+        data00 = POOL_OP(data00, data20); \
+        data01 = POOL_OP(data01, data21); \
+        \
+        res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s036, data01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s147, data01.s2)); \
+        res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s25, data01.s03)); \
+    })
+
+ACC_DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
+                                  const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    int start_x = get_global_id(0) * stride_x - pad_x;
+    int start_y = get_global_id(1) * stride_y - pad_y;
+    const int end_x = min(start_x + pool_size_x, upper_bound_w);
+    const int end_y = min(start_y + pool_size_y, upper_bound_h);
+#if defined(EXCLUDE_PADDING)
+    start_x = max(0, start_x);
+    start_y = max(0, start_y);
+#endif /* defined(EXCLUDE_PADDING) */
+    return ((end_y - start_y) * (end_x - start_x));
+}
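As a plain C model of calculate_avg_scale above (a sketch, under the assumption that gid_x/gid_y stand in for get_global_id(0)/get_global_id(1)): the average-pooling divisor is the area of the pooling window clipped against the accessible bounds, with the start optionally clamped so that padded positions do not count.

    static int avg_pool_divisor(int gid_x, int gid_y, int pool_w, int pool_h,
                                int upper_w, int upper_h, int pad_x, int pad_y,
                                int stride_x, int stride_y, int exclude_padding)
    {
        int start_x = gid_x * stride_x - pad_x;
        int start_y = gid_y * stride_y - pad_y;
        const int end_x = (start_x + pool_w < upper_w) ? start_x + pool_w : upper_w;
        const int end_y = (start_y + pool_h < upper_h) ? start_y + pool_h : upper_h;
        if (exclude_padding) /* mirrors the EXCLUDE_PADDING branch */
        {
            start_x = start_x > 0 ? start_x : 0;
            start_y = start_y > 0 ? start_y : 0;
        }
        return (end_y - start_y) * (end_x - start_x);
    }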
+
+/** Performs a pooling function of pool size equal to 2.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
+ * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void pooling_layer_2(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    // Load data
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
+    data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
+    data1 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+
+#if defined(POOL_L2)
+    // Raise to power of 2 for L2 Pooling
+    data0 = POW2_OP(data0, 2);
+    data1 = POW2_OP(data1, 2);
+#endif /* defined(POOL_L2) */
+
+    // Perform calculations
+    data0 = POOL_OP(data0, data1);
+    ACC_DATA_TYPE res = POOL_OP(data0.s0, data0.s1);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    // Divide by pool region in case of average or l2 pooling
+    res = DIV_OP(res, calculate_avg_scale(2, 2, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+    // Store result
+    *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
+}
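The three pooling flavours reduce the same window differently; a scalar C reference for the 2x2 case (a sketch, with w holding the four window values and area coming from calculate_avg_scale) would be:

    #include <math.h>

    static float pool_2x2_reference(const float w[4], float area, int is_avg, int is_l2)
    {
        if (is_l2) /* L2: square root of the average of squares */
            return sqrtf((w[0] * w[0] + w[1] * w[1] + w[2] * w[2] + w[3] * w[3]) / area);
        if (is_avg) /* average: sum divided by the clipped window area */
            return (w[0] + w[1] + w[2] + w[3]) / area;
        float m = w[0]; /* max pooling keeps the largest element */
        for (int i = 1; i < 4; ++i)
            m = (w[i] > m) ? w[i] : m;
        return m;
    }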
+
+/** Performs a pooling function of pool size equal to 3
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
+ * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void pooling_layer_3(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    // Load data
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
+    data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
+    data1 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
+    data2 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
+
+#if defined(POOL_L2)
+    // Raise to power of 2 for L2 Pooling
+    data0 = POW2_OP(data0, 3);
+    data1 = POW2_OP(data1, 3);
+    data2 = POW2_OP(data2, 3);
+#endif /* defined(POOL_L2) */
+
+    // Perform calculations
+    data0 = POOL_OP(data0, data1);
+    data0 = POOL_OP(data0, data2);
+    ACC_DATA_TYPE res = POOL_OP(POOL_OP(data0.s0, data0.s1), data0.s2);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    // Divide by pool region in case of average pooling
+    res = DIV_OP(res, calculate_avg_scale(3, 3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+    // Store result
+    *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
+}
+
+#if defined(POOLING3x3)
+
+#define CONVERT_OP(data_type) convert_##data_type##4
+#define CONVERT_VECTOR4(data_type) CONVERT_OP(data_type)
+
+VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
+calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upper_bound_h,
+                     const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+    int4 start_x = ((int4)get_global_id(0) * 4 + (int4)(0, 1, 2, 3)) * (int4)stride_x - (int4)pad_x;
+    int start_y = get_global_id(1) * stride_y - pad_y;
+    const int4 end_x = min(start_x + (int4)pool_size, (int4)upper_bound_w);
+    const int end_y = min(start_y + pool_size, upper_bound_h);
+#if defined(EXCLUDE_PADDING)
+    start_x = max((int4)0, start_x);
+    start_y = max(0, start_y);
+#endif /* defined(EXCLUDE_PADDING) */
+    return (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(1.f) / CONVERT_VECTOR4(ACC_DATA_TYPE)(((int4)(end_y - start_y)) * (end_x - start_x));
+}
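calculate_avg_scale4 is the vector counterpart of calculate_avg_scale: the optimized kernel below emits four horizontally adjacent outputs per work-item, so each lane clips its own window and receives its own reciprocal area. A simplified scalar expansion (a sketch; the y extent is assumed to have been clipped by the caller):

    static void avg_pool_scale4(float scale[4], int gid_x, int y_extent,
                                int pool_size, int upper_w, int stride_x, int pad_x,
                                int exclude_padding)
    {
        for (int lane = 0; lane < 4; ++lane)
        {
            int start_x = (gid_x * 4 + lane) * stride_x - pad_x;
            const int end_x = (start_x + pool_size < upper_w) ? start_x + pool_size : upper_w;
            if (exclude_padding && start_x < 0) /* EXCLUDE_PADDING case */
                start_x = 0;
            scale[lane] = 1.0f / (float)(y_extent * (end_x - start_x));
        }
    }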
+
+/** Performs an optimized pooling function of pool size equal to 3 when the stride_x is less than or equal to 3
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
+ * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad)
+ * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void pooling_layer_optimized_3(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
+    res;
+
+    // Perform pooling 3x3 for 4 output elements
+    POOLING3x3(res, input, output);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+    // Divide by pool region in case of average pooling
+    res *= calculate_avg_scale4(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+    // Take square root of the result in L2 pooling
+    res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+    vstore4(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 4)), 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif // defined(POOLING3x3)
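The POOLING3x3 fast path is only compiled when STRIDE_X is 1, 2 or 3 (see the macro selection at the top of the file). A plausible set of build options enabling it for FP32 average pooling, given purely as an illustration (the concrete MAX_WIDTH/MAX_HEIGHT values are assumptions):

    const char *pool3x3_avg_options =
        "-DDATA_TYPE=float -DACC_DATA_TYPE=float -DPOOL_AVG "
        "-DSTRIDE_X=1 -DSTRIDE_Y=1 -DPAD_X=0 -DPAD_Y=0 "
        "-DMAX_WIDTH=224 -DMAX_HEIGHT=224";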
diff --git a/src/core/CL/cl_kernels/common/qlstm_layer_normalization.cl b/src/core/CL/cl_kernels/common/qlstm_layer_normalization.cl
new file mode 100644
index 0000000000..4494dd8cec
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/qlstm_layer_normalization.cl
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers_asymm.h"
+
+#if VEC_SIZE == 2
+#define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 2)
+#define PERFORM_REDUCTION_IMPL(type) \
+    inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 2) sum) \
+    { \
+        sum.s0 += sum.s1; \
+        return sum.s0; \
+    }
+#elif VEC_SIZE == 4
+#define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 4)
+#define PERFORM_REDUCTION_IMPL(type) \
+    inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 4) sum) \
+    { \
+        sum.s01 += sum.s23; \
+        sum.s0 += sum.s1; \
+        return sum.s0; \
+    }
+#elif VEC_SIZE == 8
+#define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 8)
+#define PERFORM_REDUCTION_IMPL(type) \
+    inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 8) sum) \
+    { \
+        sum.s0123 += sum.s4567; \
+        sum.s01 += sum.s23; \
+        sum.s0 += sum.s1; \
+        return sum.s0; \
+    }
+#else /* VEC_SIZE DEFAULT */
+#define VEC_SIZE 16
+#define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 16)
+#define PERFORM_REDUCTION_IMPL(type) \
+    inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 16) sum) \
+    { \
+        sum.s01234567 += sum.s89abcdef; \
+        sum.s0123 += sum.s4567; \
+        sum.s01 += sum.s23; \
+        sum.s0 += sum.s1; \
+        return sum.s0; \
+    }
+#endif /* VEC_SIZE END */
+
+#define PERFORM_REDUCTION_STR(input, type) perform_reduction_##type(input)
+#define PERFORM_REDUCTION(input, type) PERFORM_REDUCTION_STR(input, type)
+
+PERFORM_REDUCTION_IMPL(int)
+PERFORM_REDUCTION_IMPL(long)
+
+/** Compute quantized multiplier and shift for the inverse square root of input.
+ * Using 3-bit fixed point and 5 iterations of Newton-Raphson method.
+ * + * @param[in] in Input to use + * @param[in] reverse_shift -1 to reverse the shift direction + * + * @return: + * .s0 Quantized multiplier for inverse square root + * .s1 Shift for inverse square root + * + */ +inline int2 get_invsqrt_quantized_multiplier_exp(int in, int reverse_shift) +{ + int2 stddev_inv; + int stddev_inv_multiplier = INT_MAX; + int stddev_inv_shift = 0; + int input = in; + if(input <= 1) + { + stddev_inv.s0 = stddev_inv_multiplier; + stddev_inv.s1 = stddev_inv_shift; + return stddev_inv; + } + + stddev_inv_shift = 11; + while(input >= (1 << 29)) + { + input /= 4; + ++stddev_inv_shift; + } + + const unsigned int max_left_shift_bits = clz(input) - 1; + const unsigned int max_left_shift_bits_pairs = max_left_shift_bits / 2; + const unsigned int left_shift_bit_pairs = max_left_shift_bits_pairs - 1; + stddev_inv_shift -= left_shift_bit_pairs; + input <<= 2 * left_shift_bit_pairs; + + typedef int FixedPointRawType; + const unsigned int fixedpoint_position = 3; + const unsigned int fixedpoint_int_position = sizeof(FixedPointRawType) * 8 - 1 - fixedpoint_position; + typedef FixedPointRawType FixedPoint3; + typedef FixedPointRawType FixedPoint0; + + const FixedPoint3 fixedpoint_input = (input >> 1); + const FixedPoint3 fixedpoint_half_input = ASYMM_ROUNDING_DIVIDE_BY_POW2(fixedpoint_input, 1, 1); + const FixedPoint3 fixedpoint_half_three = (0x1 << fixedpoint_int_position) + (0x1 << (fixedpoint_int_position - 1)); + FixedPoint3 x = 0x1 << fixedpoint_int_position; + + const int num_iteration = 5; + for(int i = 0; i < num_iteration; i++) + { + int x3 = ASYMM_RESCALE(ASYMM_MULT(ASYMM_MULT(x, x, 1), x, 1), 9, fixedpoint_position, 1); + x = ASYMM_RESCALE(ASYMM_MULT(fixedpoint_half_three, x, 1) - ASYMM_MULT(fixedpoint_half_input, x3, 1), 6, fixedpoint_position, 1); + } + const FixedPoint0 fixedpoint_half_sqrt_2 = 1518500250; + x = ASYMM_MULT(fixedpoint_half_sqrt_2, x, 1); + stddev_inv_multiplier = x; + if(stddev_inv_shift < 0) + { + stddev_inv_multiplier <<= -stddev_inv_shift; + stddev_inv_shift = 0; + } + stddev_inv_shift *= reverse_shift; + + stddev_inv.s0 = stddev_inv_multiplier; + stddev_inv.s1 = stddev_inv_shift; + return stddev_inv; +} + +#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(WIDTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) +/** This function implements QLSTM layer normalization. + * + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention Data type should be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Width of the input tensor should be passed using the -DWIDTH compile flag, e.g. -DWIDTH=16 + * + * @param[in] input_ptr Pointer to the first source tensor. Supported data types: QSYMM16 + * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[in] weight_ptr Pointer to the weight tensor. 
Supported data type: same as @p input_ptr + * @param[in] weight_stride_x Stride of the weight tensor in X dimension (in bytes) + * @param[in] weight_step_x weight_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] weight_offset_first_element_in_bytes The offset of the first element in the weight tensor + * @param[in] bias_ptr Pointer to the bias tensor. Supported data type: S32 + * @param[in] bias_stride_x Stride of the bias tensor in X dimension (in bytes) + * @param[in] bias_step_x bias_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] bias_offset_first_element_in_bytes The offset of the first element in the biases tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void qlstm_layer_normalization( + IMAGE_DECLARATION(input), + VECTOR_DECLARATION(weight), + VECTOR_DECLARATION(bias), + IMAGE_DECLARATION(output)) +{ + // Get pixels pointer + Image input = CONVERT_TO_IMAGE_STRUCT(input); + Vector weight = CONVERT_TO_VECTOR_STRUCT(weight); + Vector bias = CONVERT_TO_VECTOR_STRUCT(bias); + Image output = CONVERT_TO_IMAGE_STRUCT(output); + + VEC_DATA_TYPE(int, VEC_SIZE) + sum = 0; + VEC_DATA_TYPE(long, VEC_SIZE) + sum_sq = 0; + // Calculate partial sum + int i = 0; + for(; i <= (WIDTH - VEC_SIZE); i += VEC_SIZE) + { + // Load data + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)offset(&input, i, 0)); + + sum += CONVERT(data, VEC_DATA_TYPE(int, VEC_SIZE)); + sum_sq += CONVERT(data, VEC_DATA_TYPE(long, VEC_SIZE)) * CONVERT(data, VEC_DATA_TYPE(long, VEC_SIZE)); + } + // Perform reduction + sum.s0 = PERFORM_REDUCTION(sum, int); + sum_sq.s0 = PERFORM_REDUCTION(sum_sq, long); + + // Left-overs loop + for(; i < WIDTH; ++i) + { + DATA_TYPE data = *((__global DATA_TYPE *)offset(&input, i, 0)); + + sum.s0 += CONVERT(data, int); + sum_sq.s0 += CONVERT(data, long) * CONVERT(data, long); + } + + int temp = 0x100000 / WIDTH; + int mean = (int)(sum.s0 * 1024 / WIDTH); + int var2 = ((sum_sq.s0 * (long)temp) - ((long)mean * (long)mean)) / 0x100000; + int2 stddev_inv = get_invsqrt_quantized_multiplier_exp(var2, -1); + + i = 0; + for(; i <= (WIDTH - VEC_SIZE); i += VEC_SIZE) + { + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)offset(&input, i, 0)); + VEC_DATA_TYPE(int, VEC_SIZE) + res = CONVERT(data, VEC_DATA_TYPE(int, VEC_SIZE)) * 1024 - mean; + res = multiply_by_quantized_multiplier(res, stddev_inv.s0, stddev_inv.s1); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + w = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)vector_offset(&weight, i)); + res = res * CONVERT(w, VEC_DATA_TYPE(int, VEC_SIZE)); + res = res + VLOAD(VEC_SIZE)(0, (__global int *)vector_offset(&bias, i)); + // Due to different rounding scheme, we might need to revisit in the future: res = select(res - 512, res + 512, res > 0) / 1024; + res = (res + 512) >> 10; + res = multiply_by_quantized_multiplier(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT + 12); 
+#if defined(MIN_BOUND) + res = max(res, (VEC_DATA_TYPE(int, VEC_SIZE))MIN_BOUND); +#endif // defined(MIN_BOUND) +#if defined(MAX_BOUND) + res = min(res, (VEC_DATA_TYPE(int, VEC_SIZE))MAX_BOUND); +#endif // defined(MAX_BOUND) + VSTORE(VEC_SIZE) + (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)offset(&output, i, 0)); + } + for(; i < WIDTH; ++i) + { + DATA_TYPE data = *((__global DATA_TYPE *)offset(&input, i, 0)); + int res = (int)data * 1024 - mean; + res = MULTIPLY_BY_QUANTIZED_MULTIPLIER(res, stddev_inv.s0, stddev_inv.s1, 1); + DATA_TYPE w = *((__global DATA_TYPE *)vector_offset(&weight, i)); + res = res * (int)w; + int b = *((__global int *)vector_offset(&bias, i)); + res = res + b; + // Due to different rounding scheme, we might need to revisit in the future: res = select(res - 512, res + 512, res > 0) / 1024; + res = (res + 512) >> 10; + res = MULTIPLY_BY_QUANTIZED_MULTIPLIER(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT + 12, 1); +#if defined(MIN_BOUND) + res = max(res, MIN_BOUND); +#endif // defined(MIN_BOUND) +#if defined(MAX_BOUND) + res = min(res, MAX_BOUND); +#endif // defined(MAX_BOUND) + *((__global DATA_TYPE *)offset(&output, i, 0)) = (DATA_TYPE)res; + } +} +#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(WIDTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) */ \ No newline at end of file diff --git a/src/core/CL/cl_kernels/common/quantization_layer.cl b/src/core/CL/cl_kernels/common/quantization_layer.cl new file mode 100644 index 0000000000..69cc288c25 --- /dev/null +++ b/src/core/CL/cl_kernels/common/quantization_layer.cl @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) +#define CONVERT_RTE_VEC_STR(x, type, size) (convert_##type##size##_rte((x))) +#define CONVERT_RTE_VEC(x, type, size) CONVERT_RTE_VEC_STR(x, type, size) + +#if defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(SCALE) && defined(OFFSET) && defined(MIN_QUANT_VAL) && defined(MAX_QUANT_VAL) + +/** This performs the quantization of floating point inputs or 8-bit quantized integers to 8-bit integers. + * + * @note Input data type should be given as a preprocessor argument using -DDATA_TYPE_IN=type. e.g. -DDATA_TYPE=short + * @note Output data type should be given as a preprocessor argument using -DDATA_TYPE_OUT=type. e.g. 
-DDATA_TYPE_OUT=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Quantization scale should be given as a preprocessor argument using -DSCALE=scale. e.g. -DSCALE=0.125
+ * @note Quantization offset should be given as a preprocessor argument using -DOFFSET=offset. e.g. -DOFFSET=125
+ * @note Minimum value for quantized type should be given as a preprocessor argument using -DMIN_QUANT_VAL=value. e.g. -DMIN_QUANT_VAL=0
+ * @note Maximum value for quantized type should be given as a preprocessor argument using -DMAX_QUANT_VAL=value. e.g. -DMAX_QUANT_VAL=255
+ * @note If the input data type is a floating point (F16 or F32) the preprocessor argument should be given as -DIS_FLOAT
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
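The kernel below applies the usual affine quantization, q = clamp(round(x / SCALE) + OFFSET, MIN_QUANT_VAL, MAX_QUANT_VAL). A scalar C model for the QASYMM8 case (a sketch; the 0..255 bounds are the QASYMM8 assumption):

    #include <math.h>
    #include <stdint.h>

    static uint8_t quantize_qasymm8(float x, float scale, int offset)
    {
        /* lrintf rounds to nearest-even under the default rounding mode,
         * mirroring the convert_..._rte conversions used by the kernel. */
        int q = (int)lrintf(x / scale) + offset;
        if (q < 0)   q = 0;
        if (q > 255) q = 255;
        return (uint8_t)q;
    }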
+__kernel void quantization_layer(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    // Get pixels pointer
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+    output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+
+    // Load data
+#if defined(IS_FLOAT)
+    // Load data
+    VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+    val_float = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
+
+    // Create scale and offset vectors
+    const VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) vscale = SCALE;
+    const VEC_DATA_TYPE(int, VEC_SIZE) voffset = OFFSET;
+#else // defined(IS_FLOAT)
+    // Load data
+    VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
+    val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
+
+    const VEC_DATA_TYPE(float, VEC_SIZE)
+    val_float = CONVERT(val, VEC_DATA_TYPE(float, VEC_SIZE));
+
+    // Create scale and offset vectors
+    const VEC_DATA_TYPE(float, VEC_SIZE) vscale = SCALE;
+    const VEC_DATA_TYPE(int, VEC_SIZE) voffset = OFFSET;
+#endif // defined(IS_FLOAT)
+
+    // Quantize
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    res = CLAMP(CONVERT_RTE_VEC(val_float / vscale, int, VEC_SIZE) + voffset, MIN_QUANT_VAL, MAX_QUANT_VAL);
+
+    // Store result
+    VSTORE(VEC_SIZE)
+    (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr);
+#else //!defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
+    *((__global DATA_TYPE_OUT *)(output.ptr)) = (DATA_TYPE_OUT)CLAMP(CONVERT_RTE(((float) * (__global DATA_TYPE_IN *)input.ptr) / ((float)SCALE), int) + (int)OFFSET, MIN_QUANT_VAL, MAX_QUANT_VAL);
+#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(SCALE) && defined(OFFSET) && defined(MIN_QUANT_VAL) && defined(MAX_QUANT_VAL)
diff --git a/src/core/CL/cl_kernels/common/range.cl b/src/core/CL/cl_kernels/common/range.cl
new file mode 100644
index 0000000000..d25d10e207
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/range.cl
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VECTOR_SIZE) && defined(START) && defined(STEP) && defined(DATA_TYPE) && defined(VEC_SIZE_LEFTOVER)
+
+#if !defined(OFFSET_OUT) && !defined(SCALE_OUT)
+
+#if VECTOR_SIZE == 2
+#define STEP_VEC ((VEC_DATA_TYPE(DATA_TYPE, 2))(0, STEP))
+#elif VECTOR_SIZE == 3
+#define STEP_VEC ((VEC_DATA_TYPE(DATA_TYPE, 3))(0, STEP, 2 * STEP))
+#elif VECTOR_SIZE == 4
+#define STEP_VEC ((VEC_DATA_TYPE(DATA_TYPE, 4))(0, STEP, 2 * STEP, 3 * STEP))
+#elif VECTOR_SIZE == 8
+#define STEP_VEC ((VEC_DATA_TYPE(DATA_TYPE, 8))(0, STEP, 2 * STEP, 3 * STEP, 4 * STEP, 5 * STEP, 6 * STEP, 7 * STEP))
+#elif VECTOR_SIZE == 16
+#define STEP_VEC ((VEC_DATA_TYPE(DATA_TYPE, 16))(0, STEP, 2 * STEP, 3 * STEP, 4 * STEP, 5 * STEP, 6 * STEP, 7 * STEP, 8 * STEP, 9 * STEP, 10 * STEP, 11 * STEP, 12 * STEP, 13 * STEP, 14 * STEP, 15 * STEP))
+#endif // VECTOR_SIZE == 2
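The range kernels below (and several kernels earlier in this patch) share the same leftover-handling trick: the index of the first element written by a work-item is pulled back so that a partial block is handled by work-item 0 while every later work-item stores a full, aligned vector. In plain C (a sketch):

    static int first_element_index(int gid, int vec_size, int leftover)
    {
        int x = gid * vec_size - (vec_size - leftover) % vec_size;
        return x > 0 ? x : 0; /* clamp work-item 0 to the start of the buffer */
    }
    /* Example with vec_size = 4 and leftover = 3: work-item 0 starts at index 0
     * and stores only the 3 leftover elements; work-item 1 starts at index 3 and
     * stores a full 4-wide vector, and so on. */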
+
+/** Generates a sequence of numbers starting from START and extending by increments of 'STEP' up to but not including 'END'.
+ *
+ * @note starting value of the sequence must be given as a preprocessor argument using -DSTART=value. e.g. -DSTART=0
+ * @note difference between consecutive elements of the sequence must be given as a preprocessor argument using -DSTEP=value. e.g. -DSTEP=1
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note vector size supported by the device must be given as a preprocessor argument using -DVECTOR_SIZE=value. e.g. -DVECTOR_SIZE=4
+ * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
+ *
+ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8/S8/U16/S16/U32/S32/F16/F32.
+ * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void range(
+    VECTOR_DECLARATION(out))
+{
+    uint id = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VEC_SIZE_LEFTOVER) % VECTOR_SIZE), 0);
+    __global uchar *dst_ptr = out_ptr + out_offset_first_element_in_bytes + id * sizeof(DATA_TYPE);
+#if VECTOR_SIZE == 1
+    DATA_TYPE seq;
+    seq = (DATA_TYPE)START + (DATA_TYPE)id * (DATA_TYPE)STEP;
+
+    *(__global DATA_TYPE *)dst_ptr = seq;
+#else // VECTOR_SIZE == 1
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    seq0 = ((DATA_TYPE)START + (DATA_TYPE)id * (DATA_TYPE)STEP);
+    seq0 = seq0 + STEP_VEC;
+    STORE_VECTOR_SELECT(seq, DATA_TYPE, dst_ptr, VECTOR_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+#endif //VECTOR_SIZE == 1
+}
+
+#else // !defined(OFFSET_OUT) && !defined(SCALE_OUT)
+
+#if VECTOR_SIZE == 2
+#define STEP_VEC ((VEC_DATA_TYPE(float, 2))(0, STEP))
+#elif VECTOR_SIZE == 3
+#define STEP_VEC ((VEC_DATA_TYPE(float, 3))(0, STEP, 2 * STEP))
+#elif VECTOR_SIZE == 4
+#define STEP_VEC ((VEC_DATA_TYPE(float, 4))(0, STEP, 2 * STEP, 3 * STEP))
+#elif VECTOR_SIZE == 8
+#define STEP_VEC ((VEC_DATA_TYPE(float, 8))(0, STEP, 2 * STEP, 3 * STEP, 4 * STEP, 5 * STEP, 6 * STEP, 7 * STEP))
+#elif VECTOR_SIZE == 16
+#define STEP_VEC ((VEC_DATA_TYPE(float, 16))(0, STEP, 2 * STEP, 3 * STEP, 4 * STEP, 5 * STEP, 6 * STEP, 7 * STEP, 8 * STEP, 9 * STEP, 10 * STEP, 11 * STEP, 12 * STEP, 13 * STEP, 14 * STEP, 15 * STEP))
+#endif // VECTOR_SIZE == 2
+
+#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
+
+/** Generates a sequence of numbers starting from START and extending by increments of 'STEP' up to but not including 'END'.
+ *
+ * @note starting value of the sequence must be given as a preprocessor argument using -DSTART=value. e.g. -DSTART=0
+ * @note difference between consecutive elements of the sequence must be given as a preprocessor argument using -DSTEP=value. e.g. -DSTEP=1
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note vector size supported by the device must be given as a preprocessor argument using -DVECTOR_SIZE=vector_size. e.g. -DVECTOR_SIZE=4
+ * @note The quantization offset of the output must be passed at compile time using -DOFFSET_OUT, e.g. -DOFFSET_OUT=10
+ * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, e.g. -DSCALE_OUT=10
+ *
+ * @param[out] out_ptr Pointer to the destination tensor. Supported data types: QASYMM8.
+ * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void range_quantized(
+    VECTOR_DECLARATION(out))
+{
+    uint id = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VEC_SIZE_LEFTOVER) % VECTOR_SIZE), 0);
+    __global uchar *dst_ptr = out_ptr + out_offset_first_element_in_bytes + id * sizeof(DATA_TYPE);
+#if VECTOR_SIZE == 1
+    float seq;
+    seq = (float)START + (float)id * (float)STEP;
+    seq = seq / ((float)SCALE_OUT) + (float)OFFSET_OUT;
+    seq = max(0.0f, min(seq, 255.0f));
+    *(__global DATA_TYPE *)dst_ptr = CONVERT_SAT(CONVERT_DOWN(seq, int), DATA_TYPE);
+#else // VECTOR_SIZE == 1
+    VEC_DATA_TYPE(float, VECTOR_SIZE)
+    seq = (float)START + id * (float)STEP;
+    seq = seq + STEP_VEC;
+    seq = seq / ((VEC_DATA_TYPE(float, VECTOR_SIZE))((float)SCALE_OUT)) + ((VEC_DATA_TYPE(float, VECTOR_SIZE))((float)OFFSET_OUT));
+    seq = max((VEC_DATA_TYPE(float, VECTOR_SIZE))(0.0f), min(seq, (VEC_DATA_TYPE(float, VECTOR_SIZE))(255.0f)));
+    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+    res0 = CONVERT_SAT(CONVERT_DOWN(seq, VEC_DATA_TYPE(int, VECTOR_SIZE)), VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE));
+    STORE_VECTOR_SELECT(res, DATA_TYPE, dst_ptr, VECTOR_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+#endif // VECTOR_SIZE == 1
+}
+#endif // !defined(OFFSET_OUT) && !defined(SCALE_OUT)
+
+#endif // defined(VECTOR_SIZE) && defined(START) && defined(STEP) && defined(DATA_TYPE) && defined(VEC_SIZE_LEFTOVER)
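For reference, a minimal host-side sketch of the per-element math in range_quantized; the START/STEP/SCALE_OUT/OFFSET_OUT values below are hypothetical, and the kernel applies the same computation across VECTOR_SIZE lanes at once:

#include <math.h>
#include <stdio.h>

int main(void)
{
    const float start      = 0.0f;  /* hypothetical -DSTART      */
    const float step       = 1.5f;  /* hypothetical -DSTEP       */
    const float scale_out  = 0.5f;  /* hypothetical -DSCALE_OUT  */
    const float offset_out = 10.0f; /* hypothetical -DOFFSET_OUT */

    for(int i = 0; i < 4; ++i)
    {
        float seq = start + (float)i * step;         /* raw sequence value          */
        seq       = seq / scale_out + offset_out;    /* quantize                    */
        seq       = fmaxf(0.0f, fminf(seq, 255.0f)); /* clamp to the QASYMM8 range  */
        printf("q[%d] = %d\n", i, (int)lrintf(seq)); /* round to nearest even (RTE) */
    }
    return 0;
}

This prints q = 10, 13, 16 and 19 for the raw values 0.0, 1.5, 3.0 and 4.5, matching what one work-item would store for the first four elements.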
diff --git a/src/core/CL/cl_kernels/common/reduction_operation.cl b/src/core/CL/cl_kernels/common/reduction_operation.cl
new file mode 100644
index 0000000000..9f2c6e23b5
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/reduction_operation.cl
@@ -0,0 +1,460 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "helpers_asymm.h"
+
+#if defined(FLOAT_DATA_TYPE)
+#define ISGREATER(x, y) (SELECT_VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE))(isgreater(x, y))
+#define ISLESS(x, y) (SELECT_VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE))(isless(x, y))
+#define ISGREATER_SCALAR(x, y) (SELECT_DATA_TYPE(DATA_TYPE_PROMOTED))(isgreater(x, y))
+#define ISLESS_SCALAR(x, y) (SELECT_DATA_TYPE(DATA_TYPE_PROMOTED))(isless(x, y))
+#else // !FLOAT_DATA_TYPE
+#if defined(WIDTH)
+#define ISGREATER(x, y) (x > y) ? 1 : 0
+#define ISLESS(x, y) (x < y) ? 1 : 0
+#define ISGREATER_SCALAR ISGREATER
+#define ISLESS_SCALAR ISLESS
+#else // !defined(WIDTH)
+#define ISGREATER(x, y) select((VEC_DATA_TYPE(int, VEC_SIZE))0, (VEC_DATA_TYPE(int, VEC_SIZE)) - 1, x > y)
+#define ISLESS(x, y) select((VEC_DATA_TYPE(int, VEC_SIZE))0, (VEC_DATA_TYPE(int, VEC_SIZE)) - 1, x < y)
+#endif // defined(WIDTH)
+#endif // defined(FLOAT_DATA_TYPE)
+
+#if defined(WIDTH)
+#if defined(OPERATION)
+
+#define sum(in0, in1, size) (in0 + SUM_REDUCE(in1, size))
+#define square_sum(in0, in1, size) (in0 + SUM_REDUCE((in1 * in1), size))
+#define product(in0, in1, size) (in0 * PROD_REDUCE(in1, size))
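+
+// Each reducer folds the lanes of in1 into the scalar accumulator in0, e.g. square_sum(res, v, 16) adds the sum of the sixteen squared lanes of v to res.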
+
+/** This kernel performs a parallel reduction with a given operation on the x-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The operation we want to perform must be passed at compile time using -DOPERATION e.g. -DOPERATION=square_sum
+ * @note The mean flag must be passed at compile time using -DMEAN if we want to compute the mean value
+ * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
+ * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reduction_operation_x(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    int y = get_global_id(1);
+    int z = get_global_id(2);
+
+    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + y * input_stride_y + z * input_stride_z;
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + y * output_stride_y + z * output_stride_z;
+
+#if defined(PROD)
+    DATA_TYPE res = (DATA_TYPE)1;
+#else // defined(PROD)
+    DATA_TYPE res = (DATA_TYPE)0;
+#endif // defined(PROD)
+
+    int x = 0;
+
+    for(; x <= (WIDTH - VEC_SIZE); x += VEC_SIZE)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+        vals = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + x * sizeof(DATA_TYPE)));
+        res  = OPERATION(res, vals, VEC_SIZE);
+    }
+
+#if(WIDTH % VEC_SIZE)
+    _Pragma("unroll") for(; x < WIDTH; ++x)
+    {
+        DATA_TYPE val = *((__global DATA_TYPE *)(input_addr + x * sizeof(DATA_TYPE)));
+        res           = OPERATION(res, val, 1);
+    }
+#endif // (WIDTH % VEC_SIZE)
+
+#if defined(MEAN)
+    res /= WIDTH;
+#endif // defined(MEAN)
+    *((__global DATA_TYPE *)output_addr) = res;
+}
+#endif // defined(OPERATION)
+/** This kernel performs a reduction on the x-axis (non-parallel).
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
+ * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: S32/F16/F32 and QASYMM8/QASYMM8_SIGNED for operation MEAN
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr The local buffer to hold summed values. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reduction_operation_non_parallel_x(
+    VECTOR_DECLARATION(input),
+    VECTOR_DECLARATION(output))
+{
+    Vector input  = CONVERT_TO_VECTOR_STRUCT(input);
+    Vector output = CONVERT_TO_VECTOR_STRUCT(output);
+
+    DATA_TYPE_PROMOTED res = CONVERT(*((__global DATA_TYPE *)vector_offset(&input, 0)), DATA_TYPE_PROMOTED);
+
+    // Convert input into F32 in order to perform quantized multiplication
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    float res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+    for(unsigned int x = 1; x < WIDTH; ++x)
+    {
+        DATA_TYPE_PROMOTED in = CONVERT(*((__global DATA_TYPE *)vector_offset(&input, x)), DATA_TYPE_PROMOTED);
+#if defined(MIN)
+        res = select(res, in, ISLESS_SCALAR(in, res));
+#elif defined(MAX)
+        res = select(res, in, ISGREATER_SCALAR(in, res));
+#elif defined(PROD)
+#if defined(OFFSET) && defined(SCALE)
+        res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
+#else // !(defined(OFFSET) && defined(SCALE))
+        res *= in;
+#endif // defined(OFFSET) && defined(SCALE)
+#else // defined(SUM)
+        res += in;
+#endif // defined(MAX) || defined(MIN) || defined(PROD)
+    }
+
+    // Store result
+#if defined(MEAN)
+    res /= WIDTH;
+#endif // defined(MEAN)
+
+    // Subtract the offsets in case of quantized SUM
+#if defined(SUM) && defined(OFFSET) && defined(SCALE)
+    res -= (WIDTH - 1) * OFFSET;
+#endif // defined(SUM) && defined(OFFSET) && defined(SCALE)
+
+    // Re-quantize
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+    *((__global DATA_TYPE *)output.ptr) = CONVERT_SAT(res, DATA_TYPE);
+}
+#endif // defined(WIDTH)
+
+#if defined(HEIGHT)
+/** This kernel performs a reduction on the y-axis.
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr The local buffer to hold summed values. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reduction_operation_y(
+    IMAGE_DECLARATION(input),
+    IMAGE_DECLARATION(output))
+{
+    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+    int y = get_global_id(1);
+
+    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y;
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y;
+
+    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
+    res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
+
+    // Convert input into F32 in order to perform quantized multiplication
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    VEC_DATA_TYPE(float, VEC_SIZE)
+    res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+#if defined(SUM_SQUARE)
+    res *= res;
+#endif // defined(SUM_SQUARE)
+
+    for(unsigned int y = 1; y < HEIGHT; ++y)
+    {
+        VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
+        in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + y * input_stride_y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
+#if defined(MIN)
+        res = select(res, in, ISLESS(in, res));
+#elif defined(MAX)
+        res = select(res, in, ISGREATER(in, res));
+#else // !(defined(MAX) || defined(MIN))
+#if defined(SUM_SQUARE)
+        in *= in;
+#endif // defined(SUM_SQUARE)
+#if defined(PROD)
+
+#if defined(OFFSET) && defined(SCALE)
+        res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
+#else // !(defined(OFFSET) && defined(SCALE))
+        res *= in;
+#endif // defined(OFFSET) && defined(SCALE)
+
+#else // !defined(PROD)
+        res += in;
+#endif // defined(PROD)
+#endif // defined(MAX) || defined(MIN)
+    }
+
+#if defined(MEAN)
+    res /= HEIGHT;
+#endif // defined(MEAN)
+
+    // Subtract the offsets in case of quantized SUM
+#if defined(SUM) && defined(OFFSET) && defined(SCALE)
+    res -= (HEIGHT - 1) * OFFSET;
+#endif // defined(SUM) && defined(OFFSET) && defined(SCALE)
+
+    // Re-quantize
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+    // Store result
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    res0 = CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+    STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif // defined(HEIGHT)
+
+#if defined(DEPTH)
+/** This kernel performs a reduction on the z-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr The local buffer to hold summed values. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reduction_operation_z(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output))
+{
+    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+    int y = get_global_id(1);
+    int z = get_global_id(2);
+
+    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + z * input_stride_z;
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + z * output_stride_z;
+
+    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
+    res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
+
+    // Convert input into F32 in order to perform quantized multiplication
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    VEC_DATA_TYPE(float, VEC_SIZE)
+    res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+#if defined(SUM_SQUARE)
+    res *= res;
+#endif // defined(SUM_SQUARE)
+
+    for(unsigned int z = 1; z < DEPTH; ++z)
+    {
+        VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
+        in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + z * input_stride_z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
+
+#if defined(MIN)
+        res = select(res, in, ISLESS(in, res));
+#elif defined(MAX)
+        res = select(res, in, ISGREATER(in, res));
+#else // !(defined(MAX) || defined(MIN))
+#if defined(SUM_SQUARE)
+        in *= in;
+#endif // defined(SUM_SQUARE)
+#if defined(PROD)
+
+#if defined(OFFSET) && defined(SCALE)
+        res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
+#else // !(defined(OFFSET) && defined(SCALE))
+        res *= in;
+#endif // defined(OFFSET) && defined(SCALE)
+
+#else // !defined(PROD)
+        res += in;
+#endif // defined(PROD)
+#endif // defined(MAX) || defined(MIN)
+    }
+
+#if defined(MEAN)
+    res /= DEPTH;
+#endif // defined(MEAN)
+
+    // Subtract the offsets in case of quantized SUM
+#if defined(SUM) && defined(OFFSET) && defined(SCALE)
+    res -= (DEPTH - 1) * OFFSET;
+#endif // defined(SUM) && defined(OFFSET) && defined(SCALE)
+
+    // Re-quantize
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+    // Store result
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    res0 = CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+
+    STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif /* defined(DEPTH) */
+
+#if defined(BATCH) && defined(DEPTH)
+/** This kernel performs a reduction on the w-axis.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
+ * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr The local buffer to hold summed values. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the output tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reduction_operation_w(
+    TENSOR4D_DECLARATION(input),
+    TENSOR4D_DECLARATION(output))
+{
+    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+    int y = get_global_id(1);
+    int z = get_global_id(2);
+
+    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + (z % DEPTH) * input_stride_z + (z / DEPTH) * input_stride_w;
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + (z % DEPTH) * output_stride_z + (z / DEPTH) * output_stride_w;
+
+    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
+    res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
+
+    // Convert input into F32 in order to perform quantized multiplication
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    VEC_DATA_TYPE(float, VEC_SIZE)
+    res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+#if defined(SUM_SQUARE)
+    res *= res;
+#endif // defined(SUM_SQUARE)
+
+    for(unsigned int w = 1; w < BATCH; ++w)
+    {
+        VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
+        in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + w * input_stride_w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
+
+#if defined(MIN)
+        res = select(res, in, ISLESS(in, res));
+#elif defined(MAX)
+        res = select(res, in, ISGREATER(in, res));
+#else // !(defined(MAX) || defined(MIN))
+#if defined(SUM_SQUARE)
+        in *= in;
+#endif // defined(SUM_SQUARE)
+#if defined(PROD)
+
+#if defined(OFFSET) && defined(SCALE)
+        res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
+#else // !(defined(OFFSET) && defined(SCALE))
+        res *= in;
+#endif // defined(OFFSET) && defined(SCALE)
+
+#else // !defined(PROD)
+        res += in;
+#endif // defined(PROD)
+#endif // defined(MAX) || defined(MIN)
+    }
+
+#if defined(MEAN)
+    res /= BATCH;
+#endif // defined(MEAN)
+
+    // Subtract the offsets in case of quantized SUM
+#if defined(SUM) && defined(OFFSET) && defined(SCALE)
+    res -= (BATCH - 1) * OFFSET;
+#endif // defined(SUM) && defined(OFFSET) && defined(SCALE)
+
+    // Re-quantize
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+    // Store result
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    res0 = CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+    STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
+}
+#endif /* defined(BATCH) && defined(DEPTH) */
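In the quantized SUM paths above, each of the N accumulated inputs carries the asymmetric OFFSET once, so the running total holds N * OFFSET while a valid quantized result must hold exactly one; hence the res -= (N - 1) * OFFSET correction, with N being WIDTH, HEIGHT, DEPTH or BATCH. A minimal host-side sketch of the same correction, using hypothetical scale/offset values:

#include <stdio.h>

int main(void)
{
    const int   offset = 10;                 /* hypothetical -DOFFSET */
    const float scale  = 0.5f;               /* hypothetical -DSCALE  */
    const int   q[4]   = { 12, 14, 16, 18 }; /* quantized inputs, i.e. x = { 1, 2, 3, 4 } */

    int acc = 0;
    for(int i = 0; i < 4; ++i)
    {
        acc += q[i]; /* the running total now carries 4 * offset */
    }
    acc -= (4 - 1) * offset; /* keep exactly one offset, as the kernels do */

    /* Dequantizing the corrected result recovers the true sum: 0.5f * (30 - 10) = 10.0 */
    printf("%.1f\n", scale * (float)(acc - offset));
    return 0;
}

The same reasoning applies per lane in the vectorized y/z/w kernels.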
diff --git a/src/core/CL/cl_kernels/common/reshape_layer.cl b/src/core/CL/cl_kernels/common/reshape_layer.cl
new file mode 100644
index 0000000000..bfdefc863e
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/reshape_layer.cl
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Performs a tensor reshape
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] input_shape Input spatial shape
+ * @param[in] output_shape Output spatial shape
+ */
+__kernel void reshape_layer(TENSOR3D_DECLARATION(input),
+                            TENSOR3D_DECLARATION(output),
+                            int2 input_shape,
+                            int2 output_shape)
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+
+    int3 id = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
+
+    // Linearize index
+    int linear_idx = id.x + id.y * input_shape.x + id.z * input_shape.x * input_shape.y;
+
+    // Translate to output
+    int3 out_id;
+    out_id.x = linear_idx % output_shape.x;
+    out_id.y = (linear_idx / output_shape.x) % output_shape.y;
+    out_id.z = linear_idx / (output_shape.x * output_shape.y);
+
+    // Store result
+    *((__global DATA_TYPE *)tensor3D_offset(&out, out_id.x, out_id.y, out_id.z)) = *((__global DATA_TYPE *)in.ptr);
+}
diff --git a/src/core/CL/cl_kernels/common/reverse.cl b/src/core/CL/cl_kernels/common/reverse.cl
new file mode 100644
index 0000000000..6b0afb9c2c
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/reverse.cl
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(NUM_REVERSE_DIMS)
+
+#if NUM_REVERSE_DIMS > 4
+#error("Reversing more than 4 dimensions is not currently supported")
+#endif /* NUM_REVERSE_DIMS > 4 */
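+
+// e.g. with -DNUM_REVERSE_DIMS=2 and axis = {0, 2}, element (x, y, z, w) of the source is written to (width - 1 - x, y, depth - 1 - z, w) of the destination.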
+
+/** Performs reverse along the specified axis.
+ *
+ * @note The data type must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=uint
+ * @note The number of dimensions to reverse must be given as a preprocessor argument using -DNUM_REVERSE_DIMS=num, e.g. -DNUM_REVERSE_DIMS=3
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the first source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] axis_ptr Pointer to the source vector. Supported data types: U32
+ * @param[in] axis_stride_x Stride of the source vector in X dimension (in bytes)
+ * @param[in] axis_step_x axis_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] axis_offset_first_element_in_bytes The offset of the first element in the source vector
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reverse(TENSOR4D_DECLARATION(src),
+                      VECTOR_DECLARATION(axis),
+                      TENSOR4D_DECLARATION(dst),
+                      const uint width,
+                      const uint height,
+                      const uint depth,
+                      const uint batches)
+{
+    Tensor4D src  = CONVERT_TO_TENSOR4D_STRUCT(src, depth);
+    Vector   axis = CONVERT_TO_VECTOR_STRUCT_NO_STEP(axis);
+    Tensor4D dst  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(dst, depth);
+
+    const uint x_in = get_global_id(0);
+    const uint y_in = get_global_id(1);
+    const uint z_in = get_global_id(2) % depth;
+    const uint w_in = get_global_id(2) / depth;
+
+    const uint4 dims = (uint4)(0, 1, 2, 3);
+    int4 to_reverse  = (int4)(0, 0, 0, 0);
+#if NUM_REVERSE_DIMS == 1
+    const uint index = *((__global uint *)axis.ptr);
+    to_reverse       = (uint4)index == dims;
+#elif NUM_REVERSE_DIMS == 2
+    const uint2 indices = vload2(0, (__global uint *)axis.ptr);
+    to_reverse          = ((uint4)indices.s0 == dims) || ((uint4)indices.s1 == dims);
+#elif NUM_REVERSE_DIMS == 3
+    const uint2 indices01 = vload2(0, (__global uint *)axis.ptr);
+    const uint index2     = *((__global uint *)axis.ptr + 2);
+    to_reverse            = ((uint4)indices01.s0 == dims) || ((uint4)indices01.s1 == dims) || ((uint4)index2 == dims);
+#else /* NUM_REVERSE_DIMS == 4 */
+    const uint4 indices = vload4(0, (__global uint *)axis.ptr);
+    to_reverse          = ((uint4)indices.s0 == dims) || ((uint4)indices.s1 == dims) || ((uint4)indices.s2 == dims) || ((uint4)indices.s3 == dims);
+#endif /* NUM_REVERSE_DIMS == 1 */
+    const uint x_out = to_reverse.s0 ? width - x_in - 1 : x_in;
+    const uint y_out = to_reverse.s1 ? height - y_in - 1 : y_in;
+    const uint z_out = to_reverse.s2 ? depth - z_in - 1 : z_in;
+    const uint w_out = to_reverse.s3 ? batches - w_in - 1 : w_in;
+
+    *((__global DATA_TYPE *)tensor4D_offset(&dst, x_out, y_out, z_out, w_out)) = *((__global DATA_TYPE *)src.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(NUM_REVERSE_DIMS)
diff --git a/src/core/CL/cl_kernels/common/roi_align_layer.cl b/src/core/CL/cl_kernels/common/roi_align_layer.cl
new file mode 100644
index 0000000000..8cfe5ddcb6
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/roi_align_layer.cl
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+// This specifies the value to shift the result of roi_dims / pooled_dims before ceiling.
+// It is close to the epsilon machine (for a floating point system, x and x+EPS are the same number).
+#define EPS_GRID 0.00001f
+
+#if defined(DATA_TYPE) && defined(POOLED_DIM_X) && defined(POOLED_DIM_Y) && defined(MAX_DIM_X) && defined(MAX_DIM_Y) && defined(MAX_DIM_Z) && defined(SPATIAL_SCALE) // Check for compile time constants
+
+/** Performs a roi align on a single output pixel.
+ *
+ * @param[in] input Pointer to input Tensor3D struct.
+ * @param[in] region_start_x Start x index projected onto the input tensor.
+ * @param[in] bin_size_x Size of each pooling bin along x, on the input tensor.
+ * @param[in] grid_size_x Number of sampling points per bin along x.
+ * @param[in] region_end_x End x index projected onto the input tensor.
+ * @param[in] region_start_y Start y index projected onto the input tensor.
+ * @param[in] bin_size_y Size of each pooling bin along y, on the input tensor.
+ * @param[in] grid_size_y Number of sampling points per bin along y.
+ * @param[in] region_end_y End y index projected onto the input tensor.
+ * @param[in] pz z index of the input tensor.
+ *
+ * @return An average pooled value from the region specified in the input tensor.
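+ * @note Each sampling point is interpolated bilinearly from its four neighbouring input pixels, and the grid_size_x * grid_size_y samples are averaged.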
+ */
+inline DATA_TYPE roi_align_1x1(const Tensor3D *input, float region_start_x,
+                               float bin_size_x,
+                               float grid_size_x,
+                               float region_end_x,
+                               float region_start_y,
+                               float bin_size_y,
+                               float grid_size_y,
+                               float region_end_y,
+                               int pz)
+{
+    // Iterate through the pooling region
+    float sum = 0;
+    for(int iy = 0; iy < grid_size_y; ++iy)
+    {
+        for(int ix = 0; ix < grid_size_x; ++ix)
+        {
+            // Align the window in the middle of every bin
+            const float y = region_start_y + (iy + 0.5f) * bin_size_y / (float)grid_size_y;
+            const float x = region_start_x + (ix + 0.5f) * bin_size_x / (float)grid_size_x;
+
+            // Interpolation in the unit square
+            const int y_low  = (int)y;
+            const int x_low  = (int)x;
+            const int y_high = y_low + 1;
+            const int x_high = x_low + 1;
+
+            const float ly = y - y_low;
+            const float lx = x - x_low;
+            const float hy = 1.f - ly;
+            const float hx = 1.f - lx;
+
+            const float w1 = hy * hx;
+            const float w2 = hy * lx;
+            const float w3 = ly * hx;
+            const float w4 = ly * lx;
+#if defined(NHWC)
+            const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_low);
+            const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_low);
+            const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_high);
+            const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_high);
+#else // !defined(NHWC)
+            const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_low, pz);
+            const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_low, pz);
+            const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_high, pz);
+            const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_high, pz);
+#endif // defined(NHWC)
+            sum += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
+        }
+    }
+
+    return (DATA_TYPE)(sum / (grid_size_x * grid_size_y));
+}
+
+/** Performs a roi align function.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32;
+ * @note Datasize must be passed using -DDATA_SIZE e.g. -DDATA_SIZE=32;
+ * @note Input dimensions must be passed using -DMAX_DIM_X, -DMAX_DIM_Y and -DMAX_DIM_Z;
+ * @note Pooled region dimensions must be passed using -DPOOLED_DIM_X and -DPOOLED_DIM_Y;
+ * @note Spatial scale must be passed using -DSPATIAL_SCALE;
+ * @note Sampling ratio (i.e., the number of samples in each bin) may be passed using -DSAMPLING_RATIO. If not defined each roi
+ * will have a default sampling ratio of roi_dims/pooling_dims
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16, F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the pooled region of the source tensor as specified by ROI
+ * @param[in] rois_ptr Pointer to the ROIs tensor. Layout: { batch_index, x1, y1, x2, y2 }. Supported data types: same as @p input_ptr
+ * @param[in] rois_stride_x Stride of the ROIs tensor in X dimension (in bytes)
+ * @param[in] rois_step_x Step of the ROIs tensor in X dimension (in bytes)
+ * @param[in] rois_stride_y Stride of the ROIs tensor in Y dimension (in bytes)
+ * @param[in] rois_step_y Step of the ROIs tensor in Y dimension (in bytes)
+ * @param[in] rois_offset_first_element_in_bytes The offset of the first element in the ROIs tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void roi_align_layer(
+    TENSOR3D_DECLARATION(input),
+    IMAGE_DECLARATION(rois),
+    TENSOR3D_DECLARATION(output),
+    unsigned int input_stride_w, unsigned int output_stride_w)
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    Image    rois   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(rois);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+
+#if defined(NHWC)
+    const int px = get_global_id(1);
+    const int py = get_global_id(2);
+    const int pw = get_global_id(0);
+#else // !defined(NHWC)
+    const int px = get_global_id(0);
+    const int py = get_global_id(1);
+    const int pw = get_global_id(2);
+#endif // defined(NHWC)
+
+    // Load roi parameters
+    // roi is laid out as follows { batch_index, x1, y1, x2, y2 }
+    const ushort roi_batch = (ushort) * ((__global DATA_TYPE *)offset(&rois, 0, pw));
+    const VEC_DATA_TYPE(DATA_TYPE, 4)
+    roi                    = vload4(0, (__global DATA_TYPE *)offset(&rois, 1, pw));
+    const float2 roi_anchor = convert_float2(roi.s01) * convert_float(SPATIAL_SCALE);
+    const float2 roi_dims   = fmax(convert_float2(roi.s23 - roi.s01) * convert_float(SPATIAL_SCALE), 1.f);
+
+    // Calculate pooled region start and end
+    const float2 spatial_indx     = (float2)(px, py);
+    const float2 pooled_dims      = (float2)(POOLED_DIM_X, POOLED_DIM_Y);
+    const float2 max_spatial_dims = (float2)(MAX_DIM_X, MAX_DIM_Y);
+
+    const float2 bin_size     = (float2)((roi_dims.s0 / (float)POOLED_DIM_X), (roi_dims.s1 / (float)POOLED_DIM_Y));
+    float2       region_start = spatial_indx * bin_size + roi_anchor;
+    float2       region_end   = (spatial_indx + 1) * bin_size + roi_anchor;
+
+    region_start = clamp(region_start, 0, max_spatial_dims);
+    region_end   = clamp(region_end, 0, max_spatial_dims);
+
+#if defined(SAMPLING_RATIO)
+    const float2 roi_bin_grid = SAMPLING_RATIO;
+#else // !defined(SAMPLING_RATIO)
+    // Note that we subtract EPS_GRID before ceiling. This is to avoid situations where 1.000001 gets ceiled to 2.
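+    // e.g. a bin of size 1.000001f yields ceil(1.000001f - EPS_GRID) = 1 sampling point per axis instead of 2.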
+    const float2 roi_bin_grid = ceil(bin_size - EPS_GRID);
+#endif // defined(SAMPLING_RATIO)
+
+    // Move input and output pointer across the fourth dimension
+    input.ptr += roi_batch * input_stride_w;
+    output.ptr += pw * output_stride_w;
+    for(int pz = 0; pz < MAX_DIM_Z; ++pz)
+    {
+#if defined(NHWC)
+        __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, pz, px, py);
+#else // !defined(NHWC)
+        __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz);
+#endif // defined(NHWC)
+        *_output_ptr = (DATA_TYPE)roi_align_1x1(&input,
+                                                region_start.x,
+                                                bin_size.x,
+                                                roi_bin_grid.x,
+                                                region_end.x,
+                                                region_start.y,
+                                                bin_size.y,
+                                                roi_bin_grid.y,
+                                                region_end.y, pz);
+    }
+}
+#endif // Check for compile time constants
diff --git a/src/core/CL/cl_kernels/common/roi_align_layer_quantized.cl b/src/core/CL/cl_kernels/common/roi_align_layer_quantized.cl
new file mode 100644
index 0000000000..e75dee06f6
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/roi_align_layer_quantized.cl
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers_asymm.h"
+
+// This specifies the value to shift the result of roi_dims / pooled_dims before ceiling.
+// It is close to the epsilon machine (for a floating point system, x and x+EPS are the same number).
+#define EPS_GRID 0.00001f
+
+#if defined(DATA_TYPE) && defined(POOLED_DIM_X) && defined(POOLED_DIM_Y) && defined(MAX_DIM_X) && defined(MAX_DIM_Y) && defined(MAX_DIM_Z) && defined(SPATIAL_SCALE) && defined(OFFSET_IN) && defined(OFFSET_OUT) && defined(SCALE_IN) && defined(SCALE_OUT) && defined(OFFSET_ROIS) && defined(SCALE_ROIS) // Check for compile time constants
+
+/** Performs a roi align on a single output pixel.
+ *
+ * @param[in] input Pointer to input Tensor3D struct.
+ * @param[in] region_start_x Start x index projected onto the input tensor.
+ * @param[in] bin_size_x Size of each pooling bin along x, on the input tensor.
+ * @param[in] grid_size_x Number of sampling points per bin along x.
+ * @param[in] region_end_x End x index projected onto the input tensor.
+ * @param[in] region_start_y Start y index projected onto the input tensor.
+ * @param[in] bin_size_y Size of each pooling bin along y, on the input tensor.
+ * @param[in] grid_size_y Number of sampling points per bin along y.
+ * @param[in] region_end_y End y index projected onto the input tensor.
+ * @param[in] pz z index of the input tensor.
+ *
+ * @return An average pooled value from the region specified in the input tensor.
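+ * @note Each neighbouring pixel is dequantized with OFFSET_IN/SCALE_IN before the bilinear accumulation, and the averaged result is requantized with OFFSET_OUT/SCALE_OUT on return.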
+ */
+inline DATA_TYPE roi_align_1x1(const Tensor3D *input, float region_start_x,
+                               float bin_size_x,
+                               float grid_size_x,
+                               float region_end_x,
+                               float region_start_y,
+                               float bin_size_y,
+                               float grid_size_y,
+                               float region_end_y,
+                               int pz)
+{
+    // Iterate through the pooling region
+    float sum = 0;
+    for(int iy = 0; iy < grid_size_y; ++iy)
+    {
+        for(int ix = 0; ix < grid_size_x; ++ix)
+        {
+            // Align the window in the middle of every bin
+            const float y = region_start_y + (iy + 0.5f) * bin_size_y / (float)grid_size_y;
+            const float x = region_start_x + (ix + 0.5f) * bin_size_x / (float)grid_size_x;
+
+            // Interpolation in the unit square
+            const int y_low  = (int)y;
+            const int x_low  = (int)x;
+            const int y_high = y_low + 1;
+            const int x_high = x_low + 1;
+
+            const float ly = y - y_low;
+            const float lx = x - x_low;
+            const float hy = 1.f - ly;
+            const float hx = 1.f - lx;
+
+            const float w1 = hy * hx;
+            const float w2 = hy * lx;
+            const float w3 = ly * hx;
+            const float w4 = ly * lx;
+#if defined(NHWC)
+            const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_low);
+            const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_low);
+            const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_high);
+            const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_high);
+#else // !defined(NHWC)
+            const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_low, pz);
+            const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_low, pz);
+            const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_high, pz);
+            const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_high, pz);
+#endif // defined(NHWC)
+
+            const float data1_f32 = DEQUANTIZE(data1, OFFSET_IN, SCALE_IN, DATA_TYPE, 1);
+            const float data2_f32 = DEQUANTIZE(data2, OFFSET_IN, SCALE_IN, DATA_TYPE, 1);
+            const float data3_f32 = DEQUANTIZE(data3, OFFSET_IN, SCALE_IN, DATA_TYPE, 1);
+            const float data4_f32 = DEQUANTIZE(data4, OFFSET_IN, SCALE_IN, DATA_TYPE, 1);
+            sum += w1 * data1_f32 + w2 * data2_f32 + w3 * data3_f32 + w4 * data4_f32;
+        }
+    }
+
+    const float res_f32 = sum / (grid_size_x * grid_size_y);
+    return QUANTIZE(res_f32, OFFSET_OUT, SCALE_OUT, DATA_TYPE, 1);
+}
+
+/** Performs a roi align function.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=uchar
+ * @note Datasize must be passed using -DDATA_SIZE e.g. -DDATA_SIZE=32;
+ * @note Input dimensions must be passed using -DMAX_DIM_X, -DMAX_DIM_Y and -DMAX_DIM_Z;
+ * @note Pooled region dimensions must be passed using -DPOOLED_DIM_X and -DPOOLED_DIM_Y;
+ * @note Spatial scale must be passed using -DSPATIAL_SCALE;
+ * @note Sampling ratio (i.e., the number of samples in each bin) may be passed using -DSAMPLING_RATIO. If not defined each roi
+ * will have a default sampling ratio of roi_dims/pooling_dims
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the pooled region of the source tensor as specified by ROI
+ * @param[in] rois_ptr Pointer to the ROIs tensor. Layout: { batch_index, x1, y1, x2, y2 }.
+ * Supported data types: QASYMM16 with 0.125f scale and 0 offset
+ * @param[in] rois_stride_x Stride of the ROIs tensor in X dimension (in bytes)
+ * @param[in] rois_step_x Step of the ROIs tensor in X dimension (in bytes)
+ * @param[in] rois_stride_y Stride of the ROIs tensor in Y dimension (in bytes)
+ * @param[in] rois_step_y Step of the ROIs tensor in Y dimension (in bytes)
+ * @param[in] rois_offset_first_element_in_bytes The offset of the first element in the ROIs tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void roi_align_layer_quantized(
+    TENSOR3D_DECLARATION(input),
+    IMAGE_DECLARATION(rois),
+    TENSOR3D_DECLARATION(output),
+    unsigned int input_stride_w, unsigned int output_stride_w)
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    Image    rois   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(rois);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+
+#if defined(NHWC)
+    const int px = get_global_id(1);
+    const int py = get_global_id(2);
+    const int pw = get_global_id(0);
+#else // !defined(NHWC)
+    const int px = get_global_id(0);
+    const int py = get_global_id(1);
+    const int pw = get_global_id(2);
+#endif // defined(NHWC)
+
+    // Load roi parameters
+    // roi is laid out as follows { batch_index, x1, y1, x2, y2 }
+    const ushort roi_batch = *((__global ushort *)offset(&rois, 0, pw));
+    float4 roi             = DEQUANTIZE(vload4(0, (__global ushort *)offset(&rois, 1, pw)), OFFSET_ROIS, SCALE_ROIS, ushort, 4);
+    float2 roi_anchor      = roi.s01 * convert_float(SPATIAL_SCALE);
+    float2 roi_dims        = fmax((roi.s23 - roi.s01) * convert_float(SPATIAL_SCALE), 1.f);
+
+    // Calculate pooled region start and end
+    float2 spatial_indx = (float2)(px, py);
+    float2 pooled_dims  = (float2)(POOLED_DIM_X, POOLED_DIM_Y);
+    float2 max_spatial_dims = (float2)(MAX_DIM_X, MAX_DIM_Y);
+
+    float2 bin_size     = (float2)((roi_dims.s0 / (float)POOLED_DIM_X), (roi_dims.s1 / (float)POOLED_DIM_Y));
+    float2 region_start = spatial_indx * bin_size + roi_anchor;
+    float2 region_end   = (spatial_indx + 1) * bin_size + roi_anchor;
+
+    region_start = clamp(region_start, 0, max_spatial_dims);
+    region_end   = clamp(region_end, 0, max_spatial_dims);
+
+#if defined(SAMPLING_RATIO)
+    float2 roi_bin_grid = SAMPLING_RATIO;
+#else // !defined(SAMPLING_RATIO)
+    // Note that we subtract EPS_GRID before ceiling. This is to avoid situations where 1.000001 gets ceiled to 2.
+    float2 roi_bin_grid = ceil(bin_size - EPS_GRID);
+#endif // defined(SAMPLING_RATIO)
+
+    // Move input and output pointer across the fourth dimension
+    input.ptr += roi_batch * input_stride_w;
+    output.ptr += pw * output_stride_w;
+    for(int pz = 0; pz < MAX_DIM_Z; ++pz)
+    {
+#if defined(NHWC)
+        __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, pz, px, py);
+#else // !defined(NHWC)
+        __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz);
+#endif // defined(NHWC)
+        *_output_ptr = (DATA_TYPE)roi_align_1x1(&input,
+                                                region_start.x,
+                                                bin_size.x,
+                                                roi_bin_grid.x,
+                                                region_end.x,
+                                                region_start.y,
+                                                bin_size.y,
+                                                roi_bin_grid.y,
+                                                region_end.y, pz);
+    }
+}
+#endif // Check for compile time constants
diff --git a/src/core/CL/cl_kernels/common/roi_pooling_layer.cl b/src/core/CL/cl_kernels/common/roi_pooling_layer.cl
new file mode 100644
index 0000000000..6899b952e0
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/roi_pooling_layer.cl
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+ */ +#include "helpers.h" +#include "helpers_asymm.h" + +#if DATA_SIZE == 32 +#define VEC_SIZE 4 +#define VEC_MAX vec4_max +#elif DATA_SIZE == 16 +#define VEC_SIZE 8 +#define VEC_MAX vec8_max +#elif DATA_SIZE == 8 +#define VEC_SIZE 16 +#define VEC_MAX vec16_max +#else /* DATA_SIZE not equals 8, 16, 32 */ +#error "Unsupported data size" +#endif /* DATA_SIZE == 32 */ + +// Define whether to use max (Quantized datatype) or fmax (Float) functions +#if defined(OFFSET_OUT) && defined(SCALE_OUT) +#define MAX(x, y) max(x, y) +#else // !(defined(OFFSET_OUT) && defined(SCALE_OUT) +#define MAX(x, y) fmax(x, y) +#endif // defined(OFFSET_OUT) && defined(SCALE_OUT) + +inline DATA_TYPE vec4_max(VEC_DATA_TYPE(DATA_TYPE, 4) vec) +{ + VEC_DATA_TYPE(DATA_TYPE, 2) + temp = MAX(vec.lo, vec.hi); + return MAX(temp.x, temp.y); +} + +inline DATA_TYPE vec8_max(VEC_DATA_TYPE(DATA_TYPE, 8) vec) +{ + VEC_DATA_TYPE(DATA_TYPE, 4) + temp = MAX(vec.lo, vec.hi); + return vec4_max(temp); +} + +inline DATA_TYPE vec16_max(VEC_DATA_TYPE(DATA_TYPE, 16) vec) +{ + VEC_DATA_TYPE(DATA_TYPE, 8) + temp = MAX(vec.lo, vec.hi); + return vec8_max(temp); +} + +/** Performs a roi pooling on a single output pixel. + * + * @param[in] input Pointer to input Tensor3D struct. + * @param[in] region_start_x Start x index projected onto the input tensor. + * @param[in] region_end_x End x index projected onto the input tensor. + * @param[in] region_start_y Start y index projected onto the input tensor. + * @param[in] region_end_y End y index projected onto the input tensor. + * @param[in] pz z index of the input tensor. + * + * @return A max pooled value from the region specified in the input tensor. + */ +inline DATA_TYPE roi_pool_1x1(const Tensor3D *input, int region_start_x, int region_end_x, int region_start_y, int region_end_y, int pz) +{ + // Iterate through the pooling region + if((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) + { + return (DATA_TYPE)0; + } + else + { + int num_iter = (int)((region_end_x - region_start_x) / VEC_SIZE); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + curr_max = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(MIN_VALUE); + + for(int j = region_start_y; j < region_end_y; ++j) + { + int i = region_start_x; + for(; i < region_start_x + num_iter * VEC_SIZE; i += VEC_SIZE) + { + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(input, i, j, pz)); + curr_max = MAX(val, curr_max); + } + for(; i < region_end_x; ++i) + { + DATA_TYPE val = *(__global DATA_TYPE *)tensor3D_offset(input, i, j, pz); + curr_max = MAX(curr_max, val); + } + } + + const DATA_TYPE temp = (DATA_TYPE)VEC_MAX(curr_max); + +#if defined(OFFSET_OUT) && defined(SCALE_OUT) + return QUANTIZE(temp, OFFSET_OUT, SCALE_OUT, DATA_TYPE, 1); +#endif /* if quantized, requantize and return */ + + return temp; + } +} + +/** Performs a roi pooling function. + * + * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32, QASYMM8; + * @note Datasize must be passed using -DDATA_SIZE e.g. -DDATA_SIZE=32; + * @note Input dimensions must be passed using -DMAX_DIM_X, -DMAX_DIM_Y and -DMAX_DIM_Z; + * @note Pooled region dimensions must be passed using -DPOOLED_DIM_X and -DPOOLED_DIM_Y; + * @note Spatial scale must be passed using -DSPATIAL_SCALE; + * + * @param[in] input_ptr Pointer to the source image. 
Supported data types: F16, F32, QASYMM8 + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the pooled region of the source image as specified by ROI + * @param[in] rois_ptr Pointer to the ROIs tensor. Layout: { batch_index, x1, y1, x2, y2 }. Supported data types: same as @p input_ptr + * @param[in] rois_stride_x Stride of the ROIs tensor in X dimension (in bytes) + * @param[in] rois_step_x Step of the ROIs tensor in X dimension (in bytes) + * @param[in] rois_stride_y Stride of the ROIs tensor in Y dimension (in bytes) + * @param[in] rois_step_y Step of the ROIs tensor in Y dimension (in bytes) + * @param[in] rois_offset_first_element_in_bytes The offset of the first element in the ROIs tensor + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] input_stride_w Stride of the source image in W dimension (in bytes) + * @param[in] output_stride_w Stride of the destination image in W dimension (in bytes) + */ +__kernel void roi_pooling_layer( + TENSOR3D_DECLARATION(input), + IMAGE_DECLARATION(rois), + TENSOR3D_DECLARATION(output), + unsigned int input_stride_w, unsigned int output_stride_w) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input); + Image rois = CONVERT_TO_IMAGE_STRUCT_NO_STEP(rois); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output); + + const int px = get_global_id(0); + const int py = get_global_id(1); + const int pw = get_global_id(2); + + // Load roi parameters + // roi is laid out as follows { batch_index, x1, y1, x2, y2 } + const ushort roi_batch = (ushort) * ((__global ushort *)offset(&rois, 0, pw)); + const VEC_DATA_TYPE(ushort, 4) + roi = vload4(0, (__global ushort *)offset(&rois, 1, pw)); + const int2 roi_anchor = convert_int2_sat(round(convert_float2(roi.s01) * (float)SPATIAL_SCALE)); + const int2 roi_dims = convert_int2_sat(fmax(round(convert_float2(roi.s23 - roi.s01) * (float)SPATIAL_SCALE), 1.f)); + + // Calculate pooled region start and end + const float2 spatial_indx = (float2)(px, py); + const float2 pooled_dims = (float2)(POOLED_DIM_X, POOLED_DIM_Y); + const int2 max_spatial_dims = (int2)(MAX_DIM_X, MAX_DIM_Y); + int2 region_start = convert_int2_sat(floor(spatial_indx / pooled_dims *
convert_float2(roi_dims))) + roi_anchor; + int2 region_end = convert_int2_sat(floor((spatial_indx + 1) / pooled_dims * convert_float2(roi_dims))) + roi_anchor; + + region_start = clamp(region_start, 0, max_spatial_dims); + region_end = clamp(region_end, 0, max_spatial_dims); + + // Move input and output pointer across the fourth dimension + input.ptr += roi_batch * input_stride_w; + output.ptr += pw * output_stride_w; + + for(int pz = 0; pz < MAX_DIM_Z; ++pz) + { + *(__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz) = (DATA_TYPE)roi_pool_1x1(&input, + region_start.x, + region_end.x, + region_start.y, + region_end.y, pz); + } +} diff --git a/src/core/CL/cl_kernels/common/select.cl b/src/core/CL/cl_kernels/common/select.cl new file mode 100644 index 0000000000..6fd4bd4ce3 --- /dev/null +++ b/src/core/CL/cl_kernels/common/select.cl @@ -0,0 +1,228 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) +/** This function performs a select operation between two tensors when the condition tensor has the same rank. + * + * @attention The data_type needs to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention Leftover size in the X dimension should be given as preprocessor argument using -DVEC_SIZE_LEFTOVER=value: e.g. x_dimension % VEC_SIZE + * + * @param[in] c_ptr Pointer to the source tensor. Supported data types: U8 + * @param[in] c_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] c_step_x c_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] c_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] c_step_y c_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] c_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] c_step_z c_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] c_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] x_ptr Pointer to the source tensor.
Supported data types: All + * @param[in] x_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] x_step_x x_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] x_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] x_step_y x_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] x_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] x_step_z x_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] x_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] y_ptr Pointer to the source tensor. Supported data types: same as @p x_ptr + * @param[in] y_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] y_step_x y_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] y_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] y_step_y y_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] y_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] y_step_z y_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] y_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] out_ptr Pointer to the destination tensor. Supported data types: same as @p x_ptr + * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void select_same_rank( + TENSOR3D_DECLARATION(c), + TENSOR3D_DECLARATION(x), + TENSOR3D_DECLARATION(y), + TENSOR3D_DECLARATION(out)) +{ + // Get pointers + uint offset = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); + __global uchar *c_addr = c_ptr + c_offset_first_element_in_bytes + offset + get_global_id(1) * c_step_y + get_global_id(2) * c_step_z; + __global uchar *x_addr = x_ptr + x_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * x_step_y + get_global_id(2) * x_step_z; + __global uchar *y_addr = y_ptr + y_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * y_step_y + get_global_id(2) * y_step_z; + __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z; + + // Load values + SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + in_c = CONVERT(VLOAD(VEC_SIZE)(0, c_addr), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_addr); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + in_y = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)y_addr); + + // Calculate result + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + res0 = select(in_y, in_x, CONVERT(in_c > (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, 
SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))); + + // Boundary-aware store + STORE_VECTOR_SELECT(res, DATA_TYPE, (__global DATA_TYPE *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); +} + +/** This function performs a select operation between two tensors when the condition tensor has a different rank. + * + * @attention The data_type needs to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention Leftover size in the X dimension should be given as preprocessor argument using -DVEC_SIZE_LEFTOVER=value: e.g. x_dimension % VEC_SIZE + * + * @param[in] c_ptr Pointer to the source tensor. Supported data types: U8 + * @param[in] c_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] c_step_x c_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] c_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] x_ptr Pointer to the source tensor. Supported data types: All + * @param[in] x_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] x_step_x x_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] x_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] x_step_y x_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] x_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] x_step_z x_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] x_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] y_ptr Pointer to the source tensor. Supported data types: same as @p x_ptr + * @param[in] y_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] y_step_x y_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] y_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] y_step_y y_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] y_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] y_step_z y_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] y_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] out_ptr Pointer to the destination tensor.
Supported data types: same as @p x_ptr + * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void select_different_rank_2( + VECTOR_DECLARATION(c), + TENSOR3D_DECLARATION(x), + TENSOR3D_DECLARATION(y), + TENSOR3D_DECLARATION(out)) +{ + const int c_idx = get_global_id(1); + + // Get pointers + uint offset = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); + __global uchar *c_addr = c_ptr + c_offset_first_element_in_bytes; + __global uchar *x_addr = x_ptr + x_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * x_step_y + get_global_id(2) * x_step_z; + __global uchar *y_addr = y_ptr + y_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * y_step_y + get_global_id(2) * y_step_z; + __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z; + + // Load values + SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + in_c = *((__global uchar *)(c_addr + c_idx * c_stride_x)); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_addr); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + in_y = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)y_addr); + + // Calculate result + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + res0 = select(in_y, in_x, CONVERT(in_c > (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))); + + // Boundary-aware store + STORE_VECTOR_SELECT(res, DATA_TYPE, (__global DATA_TYPE *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); +} +#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) */ + +#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_SIZE) && defined(VEC_SIZE_LEFTOVER) +/** This function performs a select operation between two tensors when the condition tensor has a different rank. + * + * @attention The data_type needs to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar + * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @attention Leftover size in the X dimension should be given as preprocessor argument using -DVEC_SIZE_LEFTOVER=value: e.g. x_dimension % VEC_SIZE + * + * @param[in] c_ptr Pointer to the source tensor. Supported data types: U8 + * @param[in] c_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] c_step_x c_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] c_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] x_ptr Pointer to the source tensor.
Supported data types: All + * @param[in] x_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] x_step_x x_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] x_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] x_step_y x_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] x_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] x_step_z x_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] x_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] y_ptr Pointer to the source tensor. Supported data types: same as @p x_ptr + * @param[in] y_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] y_step_x y_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] y_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] y_step_y y_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] y_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] y_step_z y_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] y_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] out_ptr Pointer to the destination tensor. Supported data types: same as @p x_ptr + * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void select_different_rank_n( + VECTOR_DECLARATION(c), + TENSOR3D_DECLARATION(x), + TENSOR3D_DECLARATION(y), + TENSOR3D_DECLARATION(out)) +{ + const int c_idx = get_global_id(2) / DEPTH_SIZE; + + // Get pointers + uint offset = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); + __global uchar *c_addr = c_ptr + c_offset_first_element_in_bytes; + __global uchar *x_addr = x_ptr + x_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * x_step_y + get_global_id(2) * x_step_z; + __global uchar *y_addr = y_ptr + y_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * y_step_y + get_global_id(2) * y_step_z; + __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z; + + // Load values + SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + in_c = *((__global uchar *)(c_addr + c_idx * c_stride_x)); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_addr); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + in_y = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)y_addr); + + // Calculate result + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + res0 = select(in_y, in_x, CONVERT(in_c > (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))); + + // Boundary-aware 
store + STORE_VECTOR_SELECT(res, DATA_TYPE, (__global DATA_TYPE *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); +} +#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_SIZE) && defined(VEC_SIZE_LEFTOVER) */ \ No newline at end of file diff --git a/src/core/CL/cl_kernels/common/slice_ops.cl b/src/core/CL/cl_kernels/common/slice_ops.cl new file mode 100644 index 0000000000..d12c60f5ea --- /dev/null +++ b/src/core/CL/cl_kernels/common/slice_ops.cl @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** Performs a strided slice operation on a given input. + * + * @attention Supported tensor rank: up to 4 + * + * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float + * @attention Input and output tensor depths should be given as preprocessor arguments using -DSRC_DEPTH=size and -DDST_DEPTH=size + * @attention Absolute start coordinates for each dimension should be given as preprocessor arguments -DSTART_index=value e.g. -DSTART_0=2 + * @attention Strides for each dimension should be given as preprocessor arguments -DSTRIDE_index=value e.g. -DSTRIDE_1=1 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: All + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor.
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void strided_slice( + TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output)) +{ + // Get pixels pointer + Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, SRC_DEPTH); + Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH); + + int offset = 0; + + // Offset X +#if defined(SHRINK_0) + input.ptr += (int)START_0 * input_stride_x; +#elif defined(START_0) && defined(STRIDE_0) && defined(VEC_SIZE) && defined(LAST_ACCESSED_X) + // Check if access on width gets out of bounds + // If it does shift access vector to access elements within bounds + const int xi = (int)(get_global_id(0) * VEC_SIZE); + offset = (int)START_0 + min(xi, (int)LAST_ACCESSED_X); + input.ptr += offset * input_stride_x; + output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x; +#elif defined(START_0) && defined(STRIDE_0) + offset = (int)START_0 + (int)get_global_id(0) * (int)STRIDE_0; + input.ptr += offset * input_stride_x; +#endif // defined(START_0) && defined(STRIDE_0) + + // Offset Y +#if defined(SHRINK_1) + input.ptr += (int)START_1 * input_stride_y; +#elif defined(START_1) && defined(STRIDE_1) +#if defined(SHRINK_0) + offset = (int)START_1 + (int)get_global_id(0) * (int)STRIDE_1; +#else // defined(SHRINK_0) + offset = (int)START_1 + (int)get_global_id(1) * (int)STRIDE_1; +#endif // defined(SHRINK_0) + input.ptr += offset * input_stride_y; +#endif // defined(START_1) && defined(STRIDE_1) + + // Offset Z +#if defined(SHRINK_2) + input.ptr += (int)START_2 * input_stride_z; +#elif defined(START_2) && defined(STRIDE_2) + +#if defined(SHRINK_1) && defined(SHRINK_0) + offset = (int)START_2 + (int)get_global_id(0) * (int)STRIDE_2; +#elif defined(SHRINK_1) || defined(SHRINK_0) + offset = (int)START_2 + (int)get_global_id(1) * (int)STRIDE_2; +#else // defined(SHRINK_1) && defined(SHRINK_0) + offset = (int)START_2 + ((int)get_global_id(2) % (int)DST_DEPTH) * (int)STRIDE_2; +#endif // defined(SHRINK_1) && defined(SHRINK_0) + + input.ptr += offset * input_stride_z; +#endif // defined(START_2) && defined(STRIDE_2) + + // Offset depth +#if defined(SHRINK_3) + input.ptr += (int)START_3 * input_stride_w; +#elif defined(START_3) && defined(STRIDE_3) +#if defined(SHRINK_2) && defined(SHRINK_1) && defined(SHRINK_0) + offset = (int)START_3 + (int)get_global_id(0) * (int)STRIDE_3; +#elif !defined(SHRINK_2) && !defined(SHRINK_1) && !defined(SHRINK_0) + offset = (int)START_3 + ((int)get_global_id(2) / (int)DST_DEPTH) * (int)STRIDE_3; +#elif(defined(SHRINK_0) && defined(SHRINK_1)) || (defined(SHRINK_1) && defined(SHRINK_2)) || (defined(SHRINK_0) && defined(SHRINK_2)) + 
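// With exactly two of the first three dimensions shrunk, only one spatial dimension survives ahead of W, so the W coordinate advances with global id 1 (e.g. building with -DSHRINK_0 -DSHRINK_2 leaves Y on global id 0 and W on global id 1 -- example configuration, not from the original kernel).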
offset = (int)START_3 + (int)get_global_id(1) * (int)STRIDE_3; +#else // defined(SHRINK_2) && defined(SHRINK_1) && defined(SHRINK_0) + offset = (int)START_3 + ((int)get_global_id(2) % (int)DST_DEPTH) * (int)STRIDE_3; +#endif // defined(SHRINK_2) && defined(SHRINK_1) && defined(SHRINK_0) + input.ptr += offset * input_stride_w; +#endif // defined(START_3) && defined(STRIDE_3) + + // Store result +#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X) + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input.ptr)); + + VSTORE(VEC_SIZE) + (val, 0, (__global DATA_TYPE *)(output.ptr)); +#else // defined(VEC_SIZE) && defined(LAST_ACCESSED_X) + *((__global DATA_TYPE *)(output.ptr)) = *((__global DATA_TYPE *)(input.ptr)); +#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X) +} diff --git a/src/core/CL/cl_kernels/common/softmax_layer.cl b/src/core/CL/cl_kernels/common/softmax_layer.cl new file mode 100644 index 0000000000..4d2d89dd73 --- /dev/null +++ b/src/core/CL/cl_kernels/common/softmax_layer.cl @@ -0,0 +1,531 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) + +/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel. + * + * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float + * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0 + * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE + * @note In case of log softmax, -DLOG_SOFTMAX must be passed. + * + * @param[in] src_ptr Pointer to the source tensor slice. 
Supported data types: F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr + * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes) + * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes) + * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes) + * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor + * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void softmax_layer_norm( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(sum), + TENSOR3D_DECLARATION(dst)) +{ + const int x_offs = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VECTOR_SIZE_LEFTOVER) % VECTOR_SIZE), 0) * sizeof(DATA_TYPE); + + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z; + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z; + + Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum); + + // Load max value of 1D logits vector (row) + DATA_TYPE sum_val = *((__global DATA_TYPE *)offset(&sum, 0, get_global_id(1))); + VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) + data0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr); + +#if defined(LOG_SOFTMAX) + sum_val = log(sum_val); + data0 -= sum_val; +#else // defined(LOG_SOFTMAX) + data0 /= sum_val; +#endif // defined(LOG_SOFTMAX) + + STORE_VECTOR_SELECT(data, DATA_TYPE, dst_addr, VECTOR_SIZE, VECTOR_SIZE_LEFTOVER, VECTOR_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) +} + +#if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) && defined(MINVAL) + +/* Number of workitems in dimension 0. 
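Typically set via -DGRID_SIZE to match the local work-group size the parallel variant is launched with; it defaults to 1 below so the serial kernel compiles standalone.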
*/ +#if !defined(GRID_SIZE) +#define GRID_SIZE 1 +#endif /* !defined(GRID_SIZE) */ + +#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) +#define SELECT_TYPE SELECT_VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) + +/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value, + * then gets the exponent of each element as sums all elements across each row. + * + * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float + * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0 + * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE + * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed. + * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0). + * @note In case of log softmax, -DLOG_SOFTMAX must be passed. + * @note Based on the data type, the minimum possible value must be passed using -DMINVAL. For float it should be defined as -FLT_MAX, while for half it should be -HALF_MAX + * + * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr + * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes) + * @param[in] maxo_step_x max_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes) + * @param[in] maxo_step_y max_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes) + * @param[in] maxo_step_z max_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor + * @param[out] dst_ptr Pointer to the destination tensor slice. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr + * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes) + * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes) + * @param[in] sum_step_y sum_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes) + * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor + */ +__kernel void softmax_layer_max_shift_exp_sum_serial( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(maxo), + TENSOR3D_DECLARATION(dst), + TENSOR3D_DECLARATION(sum)) +{ + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z; + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z; + + Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo); + Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum); + +#ifdef BETA + // Initialize beta + VEC_TYPE beta = (VEC_TYPE)BETA; +#endif /* BETA */ + + // Initialize local maximum + VEC_TYPE max_val_vec = (VEC_TYPE)(MINVAL); + +#ifdef NON_MULTIPLE_OF_VECTOR_SIZE + VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr); + SELECT_TYPE widx = (SELECT_TYPE)VECTOR_SIZE_LEFTOVER > VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VECTOR_SIZE); + max_val_vec = max(max_val_vec, select((VEC_TYPE)(MINVAL), data, widx)); +#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ + + for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE) + { + VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE))); + max_val_vec = max(data, max_val_vec); + } + + // Perform max reduction + DATA_TYPE max_val = MAX_REDUCE(max_val_vec, VECTOR_SIZE); + *((__global DATA_TYPE *)maxo.ptr) = max_val; + + /* Second section */ + + // Set sum vector + VEC_TYPE sum1D = 0; + +#ifdef NON_MULTIPLE_OF_VECTOR_SIZE + data -= max_val; +#ifdef BETA + data *= beta; +#endif /* BETA */ +#ifdef LOG_SOFTMAX + VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER) + (data, 0, (__global DATA_TYPE *)dst_addr); + data = exp(data); + data = select(0, data, widx); +#else /* LOG_SOFTMAX */ + data = exp(data); + data = select(0, data, widx); + VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER) + (data, 0, (__global DATA_TYPE *)dst_addr); +#endif /* LOG_SOFTMAX */ + sum1D += data; +#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ + + // Shift values, exp and sum + 
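// For example (illustrative values), a row x = {1, 2, 3} with max_val = 3 and BETA undefined stores exp(x - 3) = {~0.14, ~0.37, 1.0} and accumulates sum1D ~= 1.50, which softmax_layer_norm later divides by.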
for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE) + { + VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE))); + data -= max_val; +#ifdef BETA + data *= beta; +#endif /* BETA */ +#ifdef LOG_SOFTMAX + VSTORE(VECTOR_SIZE) + (data, 0, (__global DATA_TYPE *)(dst_addr + i * sizeof(DATA_TYPE))); + data = exp(data); +#else /* LOG_SOFTMAX */ + data = exp(data); + VSTORE(VECTOR_SIZE) + (data, 0, (__global DATA_TYPE *)(dst_addr + i * sizeof(DATA_TYPE))); +#endif /* LOG_SOFTMAX */ + sum1D += data; + } + + // Perform sum reduction + *((__global DATA_TYPE *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE); +} + +/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value, + * then gets the exponent of each element as sums all elements across each row. + * + * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float + * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0 + * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE + * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed. + * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0). + * @note In case of log softmax, -DLOG_SOFTMAX must be passed. + * @note Based on the data type, the minimum possible value must be passed using -DMINVAL. For float it should be defined as -FLT_MAX, while for half it should be -HALF_MAX + * + * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr + * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes) + * @param[in] maxo_step_x max_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes) + * @param[in] maxo_step_y max_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes) + * @param[in] maxo_step_z max_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor + * @param[out] dst_ptr Pointer to the destination tensor slice. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr + * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes) + * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes) + * @param[in] sum_step_y sum_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes) + * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor + */ +__kernel void softmax_layer_max_shift_exp_sum_parallel( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(maxo), + TENSOR3D_DECLARATION(dst), + TENSOR3D_DECLARATION(sum)) +{ + const uint lid = get_local_id(0); + const uint x_offs = (VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE) * sizeof(DATA_TYPE); + + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z; + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z; + + Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo); + Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum); + +#ifdef BETA + // Initialize beta + VEC_TYPE beta = (VEC_TYPE)BETA; +#endif /* BETA */ + + // Define one temporary vector per work-item. + __local VEC_TYPE tmp_local[GRID_SIZE]; + __local DATA_TYPE max_local; + + VEC_TYPE max_val_vec = (VEC_TYPE)(MINVAL); + + // Number of iterations per work-item. + const uint width = (SRC_WIDTH / GRID_SIZE) >> LOG_VECTOR_SIZE; + // Calculate max of row + uint i = 0; + for(; i < width; ++i) + { + VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); + max_val_vec = max(data_max, max_val_vec); + } +#ifdef NON_MULTIPLE_OF_GRID_SIZE + // How many work-items needed to complete the computation. 
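// For example (illustrative values), with SRC_WIDTH = 70, GRID_SIZE = 4 and VECTOR_SIZE = 4: width = 4 full iterations per work-item cover 64 values, boundary_workitems = (70 % 16) / 4 = 1 handles one extra vector below, and lid == 0 picks up the VECTOR_SIZE_LEFTOVER = 2 remaining values.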
+ int boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE; + if(lid < boundary_workitems) + { + VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); + max_val_vec = max(data_max, max_val_vec); + } +#ifdef NON_MULTIPLE_OF_VECTOR_SIZE + SELECT_TYPE widx; + if(lid == 0) + { + // Handle non multiple of 4 + VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE))); + widx = (SELECT_TYPE)VECTOR_SIZE_LEFTOVER > VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VECTOR_SIZE); + max_val_vec = max(max_val_vec, select((VEC_TYPE)(MINVAL), data_max, widx)); + } +#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ +#endif /* NON_MULTIPLE_OF_GRID_SIZE */ + tmp_local[lid] = max_val_vec; + + barrier(CLK_LOCAL_MEM_FENCE); + + if(GRID_SIZE >= 256) + { + if(lid < 128) + { + tmp_local[lid] = max(tmp_local[lid + 128], tmp_local[lid]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 128) + { + if(lid < 64) + { + tmp_local[lid] = max(tmp_local[lid + 64], tmp_local[lid]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 64) + { + if(lid < 32) + { + tmp_local[lid] = max(tmp_local[lid + 32], tmp_local[lid]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 32) + { + if(lid < 16) + { + tmp_local[lid] = max(tmp_local[lid + 16], tmp_local[lid]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 16) + { + if(lid < 8) + { + tmp_local[lid] = max(tmp_local[lid + 8], tmp_local[lid]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 8) + { + if(lid < 4) + { + tmp_local[lid] = max(tmp_local[lid + 4], tmp_local[lid]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 4) + { + if(lid < 2) + { + tmp_local[lid] = max(tmp_local[lid + 2], tmp_local[lid]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(lid == 0) + { + max_val_vec = max(tmp_local[lid + 1], tmp_local[lid]); + max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE); + } + barrier(CLK_LOCAL_MEM_FENCE); + + /* Second section */ + + // Set sum vector + VEC_TYPE sum1D = 0; + DATA_TYPE max_val = max_local; + + // Shift values, exp and sum + for(i = 0; i < width; ++i) + { + VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); + data -= max_val; +#ifdef BETA + data *= beta; +#endif /* BETA */ +#ifdef LOG_SOFTMAX + VSTORE(VECTOR_SIZE) + (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); + data = exp(data); +#else /* LOG_SOFTMAX */ + data = exp(data); + VSTORE(VECTOR_SIZE) + (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); +#endif /* LOG_SOFTMAX */ + sum1D += data; + } +#ifdef NON_MULTIPLE_OF_GRID_SIZE + boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE; + if(lid < boundary_workitems) + { + VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); + data -= max_val; +#ifdef BETA + data *= beta; +#endif /* BETA */ +#ifdef LOG_SOFTMAX + VSTORE(VECTOR_SIZE) + (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); + data = exp(data); +#else /* LOG_SOFTMAX */ + data = exp(data); + VSTORE(VECTOR_SIZE) + (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); +#endif /* LOG_SOFTMAX */ + sum1D += data; + } +#ifdef NON_MULTIPLE_OF_VECTOR_SIZE + if(lid == 
0) + { + // Handle non multiple of vector size ((GRID_SIZE * i * 4) + 4, 0); move 4 float positions ahead, *4 is due to the stride + VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE))); + data -= max_val; +#ifdef BETA + data *= beta; +#endif /* BETA */ +#ifdef LOG_SOFTMAX + VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER) + (data, 0, (__global DATA_TYPE *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE))); + data = exp(data); + data = select(0, data, widx); +#else /* LOG_SOFTMAX */ + data = exp(data); + data = select(0, data, widx); + VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER) + (data, 0, (__global DATA_TYPE *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE))); +#endif /* LOG_SOFTMAX */ + sum1D += data; + } +#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ +#endif /* NON_MULTIPLE_OF_GRID_SIZE */ + tmp_local[lid] = sum1D; + + barrier(CLK_LOCAL_MEM_FENCE); + + if(GRID_SIZE >= 256) + { + if(lid < 128) + { + tmp_local[lid] += tmp_local[lid + 128]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 128) + { + if(lid < 64) + { + tmp_local[lid] += tmp_local[lid + 64]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 64) + { + if(lid < 32) + { + tmp_local[lid] += tmp_local[lid + 32]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 32) + { + if(lid < 16) + { + tmp_local[lid] += tmp_local[lid + 16]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 16) + { + if(lid < 8) + { + tmp_local[lid] += tmp_local[lid + 8]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 8) + { + if(lid < 4) + { + tmp_local[lid] += tmp_local[lid + 4]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 4) + { + if(lid < 2) + { + tmp_local[lid] += tmp_local[lid + 2]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(lid == 0) + { + sum1D = (tmp_local[lid + 1] + tmp_local[lid]); + // Perform sum reduction + *((__global DATA_TYPE *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE); + } +} + +#endif // defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) && defined(MINVAL) +#endif // defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/common/softmax_layer_quantized.cl b/src/core/CL/cl_kernels/common/softmax_layer_quantized.cl new file mode 100644 index 0000000000..4d5006d804 --- /dev/null +++ b/src/core/CL/cl_kernels/common/softmax_layer_quantized.cl @@ -0,0 +1,530 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers_asymm.h" + +#if defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) && defined(DIFF_MIN) + +#define VEC_BASE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) +#define VEC_INT VEC_DATA_TYPE(int, VECTOR_SIZE) + +/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel. + * + * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar + * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128 + * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16 + * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE + * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0) + * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS. + * @note -DDIFF_MIN must be passed at compile time. It is threshold difference between maximum value of input data and current processed value, it defines whether the value will be taken into account or not. + * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed. + * + * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: S32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr + * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes) + * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes) + * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes) + * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor + * @param[out] dst_ptr Pointer to the destination tensor slice. 
+/** Divides all the values of the input tensor by the sum computed by the softmax_layer_max_shift_exp_sum_quantized kernels below.
+ *
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar
+ * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128
+ * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size, e.g. -DVECTOR_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER, e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
+ * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, beta is assumed to be 1.0)
+ * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS.
+ * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the currently processed value; it defines whether the value will be taken into account or not.
+ * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed.
+ *
+ * @param[in]  src_ptr                           Pointer to the source tensor slice. Supported data types: S32
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in]  sum_ptr                           Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  sum_stride_x                      Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                        sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                      Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                        sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_stride_z                      Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in]  sum_step_z                        sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void softmax_layer_norm_quantized(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(sum),
+    TENSOR3D_DECLARATION(dst))
+{
+    const int x_offs = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VECTOR_SIZE_LEFTOVER) % VECTOR_SIZE), 0);
+
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(int) + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+
+    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
+
+    // Load the sum of exponentials of the 1D logits vector (row)
+    int sum_val = *((__global int *)offset(&sum, 0, get_global_id(1)));
+
+    // It would be better to calculate this once in the previous kernel and pass it here as a parameter
+    uint sum_val_u              = convert_uint(sum_val);
+    int headroom_plus_one       = clz(sum_val_u);
+    int num_bits_over_unit      = EXP_ACCUMULATION_INT_BITS - headroom_plus_one;
+    int shifted_sum_minus_one_1 = convert_int((sum_val_u << headroom_plus_one) - (1u << 31));
+    VEC_INT shifted_sum_minus_one = shifted_sum_minus_one_1;
+    VEC_INT shifted_scale         = ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(shifted_sum_minus_one, VECTOR_SIZE);
+
+    // The difference with the row maximum was already calculated by the previous kernel; it is read back from the tmp output and reused
+    VEC_INT data_diff      = VLOAD(VECTOR_SIZE)(0, (__global int *)src_addr);
+    VEC_INT data_diff_mult = data_diff;
+#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
+    if(INPUT_BETA_MULTIPLIER > 1)
+    {
+        data_diff_mult = ASYMM_MULT(data_diff * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, VECTOR_SIZE);
+    }
+#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
+
+    VEC_INT data = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
+    data         = ASYMM_MULT(shifted_scale, data, VECTOR_SIZE);
+    data         = ASYMM_ROUNDING_DIVIDE_BY_POW2(data, num_bits_over_unit + 31 - 8, VECTOR_SIZE);
+#ifdef QASYMM8_SIGNED
+    data += (VEC_INT)(MIN_VALUE);
+#endif /* QASYMM8_SIGNED */
+    data = select(MIN_VALUE, data, data_diff >= (VEC_INT)(DIFF_MIN));
+    VEC_BASE data0 = CONVERT_SAT(data, VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE));
+
+    STORE_VECTOR_SELECT(data, DATA_TYPE, dst_addr, VECTOR_SIZE, VECTOR_SIZE_LEFTOVER, VECTOR_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
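The headroom/clz bookkeeping above is the standard gemmlowp trick for dividing by a fixed-point sum: shift the sum left until it lies in [1, 2) as a Q0.31 value 1 + x, take 1/(1 + x) with ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1, and account for the discarded power of two in the final rounding shift. A minimal host-side C model of that bookkeeping, with illustrative values and the vectorised helper modelled in floating point:

```c
#include <stdint.h>
#include <stdio.h>

static int clz_u32(uint32_t x) { return x == 0 ? 32 : __builtin_clz(x); }

int main(void)
{
    const int      exp_accumulation_int_bits = 12;       /* example -DEXP_ACCUMULATION_INT_BITS */
    const uint32_t sum_val                   = 1u << 14; /* example row sum; as Q12 its real value is sum_val / 2^(31-12) */

    /* Shift the sum left until its MSB is set: (sum_val << h) == (1 + x) * 2^31 with x in [0, 1) */
    const int headroom_plus_one  = clz_u32(sum_val);
    const int num_bits_over_unit = exp_accumulation_int_bits - headroom_plus_one;
    const int32_t shifted_sum_minus_one = (int32_t)((sum_val << headroom_plus_one) - (1u << 31));

    /* ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1 would return 1/(1+x) as Q0.31; model it in float */
    const double x          = shifted_sum_minus_one / 2147483648.0; /* divide by 2^31 */
    const double reciprocal = 1.0 / (1.0 + x);

    /* The real 1/sum equals reciprocal * 2^(-num_bits_over_unit); the kernel folds the
     * remaining shift into ASYMM_ROUNDING_DIVIDE_BY_POW2(..., num_bits_over_unit + 31 - 8). */
    printf("1/sum ~= %f * 2^(%d)\n", reciprocal, -num_bits_over_unit);
    return 0;
}
```

With these example values the model prints `1/sum ~= 1.000000 * 2^(5)`, i.e. 32, matching a real sum of 2^14 / 2^19 = 1/32.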
+
+#if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE)
+
+/* Number of workitems in dimension 0. */
+#if !defined(GRID_SIZE)
+#define GRID_SIZE 1
+#endif /* !defined(GRID_SIZE) */
+
+#define VEC_UINT VEC_DATA_TYPE(uint, VECTOR_SIZE)
+
+VEC_INT mult_by_quantized_multiplier(VEC_INT data)
+{
+#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT)
+    if(INPUT_BETA_MULTIPLIER > 1)
+    {
+        return ASYMM_MULT(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, VECTOR_SIZE);
+    }
+#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */
+    return data;
+}
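mult_by_quantized_multiplier() applies beta as a fixed-point multiplier: the value is pre-shifted left by INPUT_BETA_LEFT_SHIFT and then run through ASYMM_MULT, gemmlowp's saturating rounding doubling high multiply. A scalar C sketch of that primitive, under the usual gemmlowp convention (function names here are illustrative):

```c
#include <stdint.h>

/* Scalar model of ASYMM_MULT: result ~= round(a * b / 2^31), saturated. */
static int32_t asymm_mult_scalar(int32_t a, int32_t b)
{
    if(a == INT32_MIN && b == INT32_MIN)
    {
        return INT32_MAX; /* the only case that would overflow */
    }
    const int64_t ab    = (int64_t)a * (int64_t)b;
    const int32_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30)); /* round to nearest */
    return (int32_t)((ab + nudge) / (1ll << 31));
}

/* Beta scaling as done by mult_by_quantized_multiplier: pre-shift, then high multiply. */
static int32_t scale_by_beta(int32_t diff, int32_t multiplier, int left_shift)
{
    return asymm_mult_scalar(diff * (1 << left_shift), multiplier);
}
```

In effect `diff` is multiplied by multiplier * 2^(left_shift - 31), which is how a real-valued beta (with the input scale folded in) is encoded into the INPUT_BETA_* constants.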
+
+/** Shifts the values of the input tensor by the row maximum calculated in the first part of the kernel,
+ * then takes the exponent of each element and sums all elements across each row.
+ *
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar
+ * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128
+ * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size, e.g. -DVECTOR_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER, e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
+ * @note In case the input is not a multiple of VECTOR_SIZE, -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
+ * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, beta is assumed to be 1.0)
+ * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS.
+ * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the currently processed value; it defines whether the value will be taken into account or not.
+ * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed.
+ *
+ * @param[in]  src_ptr                            Pointer to the source tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in]  maxo_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  maxo_stride_x                      Stride of the max values tensor in X dimension (in bytes)
+ * @param[in]  maxo_step_x                        maxo_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  maxo_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
+ * @param[in]  maxo_step_y                        maxo_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  maxo_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
+ * @param[in]  maxo_step_z                        maxo_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor slice. Supported data types: S32
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[out] sum_ptr                            Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr
+ * @param[in]  sum_stride_x                       Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                         sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                       Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                         sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_stride_z                       Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in]  sum_step_z                         sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes  The offset of the first element in the sum values tensor
+ */
+__kernel void softmax_layer_max_shift_exp_sum_quantized_serial(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(maxo),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(sum))
+{
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+
+    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
+    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+
+    VEC_BASE max_val_vec = (VEC_BASE)(MIN_VALUE);
+
+    // Calculate max of row
+#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
+    VEC_BASE vec_min_val = (VEC_BASE)(MIN_VALUE);
+    VEC_BASE data        = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);
+    VEC_INT widx         = (VEC_INT)VECTOR_SIZE_LEFTOVER > VEC_OFFS(int, VECTOR_SIZE);
+    max_val_vec          = max(max_val_vec, select(vec_min_val, data, CONVERT(widx, VEC_BASE)));
+#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+
+    for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
+    {
+        VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
+        max_val_vec   = max(data, max_val_vec);
+    }
+
+    // Perform max reduction
+    DATA_TYPE max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE);
+    *((__global DATA_TYPE *)maxo.ptr) = max_local;
+
+    // Second part
+
+    // Load max value of 1D logits vector (row)
+    int max_val = convert_int(max_local);
+
+    // Set sum vector, Q(EXP_ACCUMULATION_INT_BITS)
+    VEC_INT sum1D = 0;
+
+#ifdef NON_MULTIPLE_OF_VECTOR_SIZE
+    VEC_INT data_fp        = CONVERT(data, VEC_INT);
+    VEC_INT data_diff      = data_fp - max_val;
+    VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
+    data_fp                = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
+    data_fp                = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
+    VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER)
+    (data_diff, 0, (__global int *)dst_addr);
+    data_fp = select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
+    sum1D += select(0, data_fp, widx);
+#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */
+
+    // Shift values, exp and sum
+    for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE)
+    {
+        VEC_BASE data          = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE)));
+        VEC_INT data_fp        = CONVERT(data, VEC_INT);
+        VEC_INT data_diff      = data_fp - max_val;
+        VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff);
+        data_fp                = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE);
+        data_fp                = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE);
+        VSTORE(VECTOR_SIZE)
+        (data_diff, 0, (__global int *)(dst_addr + i * sizeof(int)));
+        sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN));
+    }
+
+    // Perform sum reduction
+    *((__global int *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE);
+}
+
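Putting the serial pass together, this is the whole row computation in scalar form, with the fixed-point exponential modelled by exp() for readability (the row values, beta and diff_min below are illustrative, and the real kernel folds the input quantisation scale into the beta multiplier):

```c
#include <math.h>
#include <stdio.h>

int main(void)
{
    const int    row[8]   = { -3, 10, 7, 10, -128, 0, 4, 9 }; /* one quantized row */
    const double beta     = 0.1;  /* models INPUT_BETA_MULTIPLIER / INPUT_BETA_LEFT_SHIFT */
    const int    diff_min = -32;  /* models -DDIFF_MIN: smaller differences contribute nothing */

    /* First part: row maximum */
    int max_val = -128;
    for(int i = 0; i < 8; ++i)
    {
        if(row[i] > max_val)
        {
            max_val = row[i];
        }
    }

    /* Second part: shift, exponentiate, gate and sum */
    double sum = 0.0;
    for(int i = 0; i < 8; ++i)
    {
        const int diff = row[i] - max_val; /* this is what is stored to the S32 tmp output */
        if(diff >= diff_min)               /* the select(...) gating on DIFF_MIN */
        {
            sum += exp(beta * diff);       /* ASYMM_EXP_ON_NEGATIVE_VALUES + ASYMM_RESCALE */
        }
    }
    printf("max = %d, sum of exps = %f\n", max_val, sum);
    return 0;
}
```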
+/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
+ * then takes the exponent of each element and sums all elements across each row.
+ *
+ * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar
+ * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128
+ * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size, e.g. -DVECTOR_SIZE=16
+ * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER, e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
+ * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
+ * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, beta is assumed to be 1.0)
+ * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS.
+ * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the currently processed value; it defines whether the value will be taken into account or not.
+ * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed.
+ *
+ * @param[in]  src_ptr                            Pointer to the source tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in]  maxo_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
+ * @param[in]  maxo_stride_x                      Stride of the max values tensor in X dimension (in bytes)
+ * @param[in]  maxo_step_x                        maxo_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  maxo_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
+ * @param[in]  maxo_step_y                        maxo_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  maxo_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
+ * @param[in]  maxo_step_z                        maxo_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
+ * @param[out] dst_ptr                            Pointer to the destination tensor slice. Supported data types: S32
+ * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[out] sum_ptr                            Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr
+ * @param[in]  sum_stride_x                       Stride of the sum values tensor in X dimension (in bytes)
+ * @param[in]  sum_step_x                         sum_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  sum_stride_y                       Stride of the sum values tensor in Y dimension (in bytes)
+ * @param[in]  sum_step_y                         sum_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  sum_stride_z                       Stride of the sum values tensor in Z dimension (in bytes)
+ * @param[in]  sum_step_z                         sum_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  sum_offset_first_element_in_bytes  The offset of the first element in the sum values tensor
+ */
+__kernel void softmax_layer_max_shift_exp_sum_quantized_parallel(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(maxo),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(sum))
+{
+    const uint lid    = get_local_id(0);
+    const uint x_offs = (VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE);
+
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(int) + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
+
+    Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo);
+    Image sum  = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum);
+
+    // Define one temporary vector per work-item.
+    __local VEC_INT   tmp_local[GRID_SIZE];
+    __local DATA_TYPE max_local;
+
+    VEC_BASE vec_min_val = (VEC_BASE)(MIN_VALUE);
+    VEC_BASE max_val_vec = vec_min_val;
+
+    // Number of iterations per work-item.
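+    // Work partitioning: each of the GRID_SIZE work-items starts VECTOR_SIZE elements
+    // apart and strides through the row in steps of GRID_SIZE * VECTOR_SIZE elements,
+    // so it runs (SRC_WIDTH / GRID_SIZE) / VECTOR_SIZE full-vector iterations; the
+    // shift by LOG_VECTOR_SIZE (= log2(VECTOR_SIZE)) below implements that division.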
+ const uint width = (SRC_WIDTH / GRID_SIZE) >> LOG_VECTOR_SIZE; + // Calculate max of row + uint i = 0; + for(; i < width; ++i) + { + VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); + max_val_vec = max(data_max, max_val_vec); + } +#ifdef NON_MULTIPLE_OF_GRID_SIZE + // How many work-items needed to complete the computation. + int boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE; + if(lid < boundary_workitems) + { + VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); + max_val_vec = max(data_max, max_val_vec); + } +#ifdef NON_MULTIPLE_OF_VECTOR_SIZE + VEC_INT widx; + if(lid == 0) + { + // Handle non multiple of 4 + VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE))); + widx = (VEC_INT)VECTOR_SIZE_LEFTOVER > VEC_OFFS(int, VECTOR_SIZE); + max_val_vec = max(max_val_vec, select(vec_min_val, data_max, CONVERT(widx, VEC_BASE))); + } +#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ +#endif /* NON_MULTIPLE_OF_GRID_SIZE */ + tmp_local[lid] = CONVERT(max_val_vec, VEC_INT); + + barrier(CLK_LOCAL_MEM_FENCE); + + if(GRID_SIZE >= 256) + { + if(lid < 128) + { + tmp_local[lid] = max(tmp_local[lid + 128], tmp_local[lid]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 128) + { + if(lid < 64) + { + tmp_local[lid] = max(tmp_local[lid + 64], tmp_local[lid]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 64) + { + if(lid < 32) + { + tmp_local[lid] = max(tmp_local[lid + 32], tmp_local[lid]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 32) + { + if(lid < 16) + { + tmp_local[lid] = max(tmp_local[lid + 16], tmp_local[lid]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 16) + { + if(lid < 8) + { + tmp_local[lid] = max(tmp_local[lid + 8], tmp_local[lid]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 8) + { + if(lid < 4) + { + tmp_local[lid] = max(tmp_local[lid + 4], tmp_local[lid]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 4) + { + if(lid < 2) + { + tmp_local[lid] = max(tmp_local[lid + 2], tmp_local[lid]); + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(lid == 0) + { + max_val_vec = max(CONVERT((tmp_local[lid + 1]), VEC_BASE), CONVERT((tmp_local[lid]), VEC_BASE)); + max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE); + } + barrier(CLK_LOCAL_MEM_FENCE); + + /* Second section */ + + // Set sum vector + VEC_INT sum1D = 0; + int max_val = convert_int(max_local); + + // Shift values, exp and sum + for(i = 0; i < width; ++i) + { + VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); + VEC_INT data_fp = CONVERT(data, VEC_INT); + VEC_INT data_diff = data_fp - max_val; + VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff); + data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE); + data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE); + VSTORE(VECTOR_SIZE) + (data_diff, 0, (__global int *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(int))); + sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN)); + } +#ifdef NON_MULTIPLE_OF_GRID_SIZE + boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE; + if(lid < boundary_workitems) + { + VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * 
sizeof(DATA_TYPE))); + VEC_INT data_fp = CONVERT(data, VEC_INT); + VEC_INT data_diff = data_fp - max_val; + VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff); + data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE); + data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE); + VSTORE(VECTOR_SIZE) + (data_diff, 0, (__global int *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(int))); + sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN)); + } +#ifdef NON_MULTIPLE_OF_VECTOR_SIZE + if(lid == 0) + { + // Handle non multiple of vector size ((GRID_SIZE * i * 4) + 4, 0); move 4 float positions ahead, *4 is due to the stride + VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE))); + VEC_INT data_fp = CONVERT(data, VEC_INT); + VEC_INT data_diff = data_fp - max_val; + VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff); + data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE); + data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE); + VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER) + (data_diff, 0, (__global int *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(int))); + data_fp = select(MIN_VALUE, data_fp, data_diff >= (VEC_INT)(DIFF_MIN)); + data_fp = select(0, data_fp, widx); + sum1D = sum1D + data_fp; + } +#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ +#endif /* NON_MULTIPLE_OF_GRID_SIZE */ + tmp_local[lid] = sum1D; + + barrier(CLK_LOCAL_MEM_FENCE); + + if(GRID_SIZE >= 256) + { + if(lid < 128) + { + tmp_local[lid] += tmp_local[lid + 128]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 128) + { + if(lid < 64) + { + tmp_local[lid] += tmp_local[lid + 64]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 64) + { + if(lid < 32) + { + tmp_local[lid] += tmp_local[lid + 32]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 32) + { + if(lid < 16) + { + tmp_local[lid] += tmp_local[lid + 16]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 16) + { + if(lid < 8) + { + tmp_local[lid] += tmp_local[lid + 8]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 8) + { + if(lid < 4) + { + tmp_local[lid] += tmp_local[lid + 4]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(GRID_SIZE >= 4) + { + if(lid < 2) + { + tmp_local[lid] += tmp_local[lid + 2]; + } + barrier(CLK_LOCAL_MEM_FENCE); + } + if(lid == 0) + { + sum1D = (tmp_local[lid + 1] + tmp_local[lid]); + // Perform sum reduction + *((__global int *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE); + } +} +#endif // #if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) +#endif /* defined(DATA_TYPE) && defined(DIFF_MIN) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) && defined(MIN_VALUE) */ diff --git a/src/core/CL/cl_kernels/common/stack_layer.cl b/src/core/CL/cl_kernels/common/stack_layer.cl new file mode 100644 index 0000000000..2468bf750d --- /dev/null +++ b/src/core/CL/cl_kernels/common/stack_layer.cl @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(AXIS) && defined(SRC_DIM2) && defined(DST_DIM3) + +#if AXIS == 0 +#define X_DST (idx_input) +#define Y_DST (x_src) +#define Z_DST (y_src) +#define W_DST (z_src) +#define K_DST (w_src) +#elif AXIS == 1 // AXIS == 1 +#define X_DST (x_src) +#define Y_DST (idx_input) +#define Z_DST (y_src) +#define W_DST (z_src) +#define K_DST (w_src) +#elif AXIS == 2 // AXIS == 2 +#define X_DST (x_src) +#define Y_DST (y_src) +#define Z_DST (idx_input) +#define W_DST (z_src) +#define K_DST (w_src) +#elif AXIS == 3 // AXIS == 3 +#define X_DST (x_src) +#define Y_DST (y_src) +#define Z_DST (z_src) +#define W_DST (idx_input) +#define K_DST (w_src) +#elif AXIS == 4 // AXIS == 4 +#define X_DST (x_src) +#define Y_DST (y_src) +#define Z_DST (z_src) +#define W_DST (w_src) +#define K_DST (idx_input) +#else // AXIS not supported +#error "Not supported axis" +#endif // AXIS == 0 + +/** OpenCL kernel to stack a rank-R tensor into one with rank-(R+1) along the axis dimension + * + * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float + * @note The dimension to stack the tensors along has to be passed at compile time using -DAXIS. i.e. -DAXIS=1 + * @note Dimension 2 of the input tensor must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM2=112) + * @note Dimension 3 of the output tensor must be passed at compile time using -DDST_DIM3 (e.g. -DDST_DIM3=112) + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: All
+ * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  idx_input                         Index of the input tensor in the list of tensors to stack
+ */
+__kernel void stack_layer(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+    unsigned int idx_input)
+{
+    uint x_src = get_global_id(0);
+    uint y_src = get_global_id(1);
+    uint z_src = (get_global_id(2) % SRC_DIM2);
+    uint w_src = (get_global_id(2) / SRC_DIM2);
+
+    __global DATA_TYPE *src = (__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + x_src * sizeof(DATA_TYPE) + y_src * src_stride_y + z_src * src_stride_z + w_src * src_stride_w);
+
+    __global DATA_TYPE *dst = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + X_DST * sizeof(DATA_TYPE) + Y_DST * dst_stride_y + Z_DST * dst_stride_z + W_DST * dst_stride_w + K_DST *
+                                                     dst_stride_w * (uint)DST_DIM3);
+
+    *dst = *src;
+}
+
+#undef X_DST
+#undef Y_DST
+#undef Z_DST
+#undef W_DST
+#undef K_DST
+#endif // defined(DATA_TYPE) && defined(AXIS) && defined(SRC_DIM2) && defined(DST_DIM3)
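The AXIS-selected macros above just splice idx_input into the output coordinate tuple at position AXIS. A small host-side C model of that mapping (the function names are illustrative):

```c
#include <stdio.h>

/* Stacking N rank-4 tensors along 'axis' maps a source coordinate (x, y, z, w)
 * of input 'idx_input' to a rank-5 output coordinate with idx_input spliced in
 * at position 'axis'. */
static void stacked_coord(int axis, int idx_input,
                          int x, int y, int z, int w, int out[5])
{
    const int src[4] = { x, y, z, w };
    int s = 0;
    for(int d = 0; d < 5; ++d)
    {
        out[d] = (d == axis) ? idx_input : src[s++];
    }
}

int main(void)
{
    int out[5];
    stacked_coord(1 /* AXIS */, 3 /* idx_input */, 5, 6, 7, 8, out);
    printf("%d %d %d %d %d\n", out[0], out[1], out[2], out[3], out[4]);
    return 0;
}
```

For AXIS == 1 this prints `5 3 6 7 8`, i.e. (x, idx_input, y, z, w), matching the X_DST/Y_DST/Z_DST/W_DST/K_DST definitions.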
diff --git a/src/core/CL/cl_kernels/common/tile.cl b/src/core/CL/cl_kernels/common/tile.cl
new file mode 100644
index 0000000000..4332411688
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/tile.cl
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#if defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_DEPTH)
+/** Performs a tile operation, repeating the input tensor along the requested dimensions.
+ *
+ * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @attention Vector size can optionally be passed using -DVEC_SIZE=size together with -DOFFSET, e.g. -DVEC_SIZE=16; otherwise a scalar path is used
+ * @note All data types are supported. SRC_WIDTH, SRC_HEIGHT, SRC_DEPTH and DST_DEPTH must be passed at compile time, e.g. -DSRC_WIDTH=64
+ *
+ * @param[in]  input_ptr                             Pointer to the source image. Supported data types: All
+ * @param[in]  input_stride_x                        Stride of the source image in X dimension (in bytes)
+ * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                        Stride of the source image in Y dimension (in bytes)
+ * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source image
+ * @param[out] output_ptr                            Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                       Stride of the destination image in X dimension (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination image in Y dimension (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination image
+ */
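Before the kernel itself, it may help to see the x-index arithmetic of the vectorised path in isolation: every completed tile, plus the current tile when its last vector would cross SRC_WIDTH, pushes the read (and write) position back by OFFSET elements. A host-side C model with illustrative sizes, assuming the host sets OFFSET = VEC_SIZE - SRC_WIDTH % VEC_SIZE:

```c
#include <stdio.h>

int main(void)
{
    const int VEC_SIZE = 4, SRC_WIDTH = 10, OFFSET = 2; /* 10 % 4 = 2 -> backtrack by 2 */

    /* 6 threads cover a 20-wide output (two tiles of the 10-wide input) */
    for(int id = 0; id < 6; ++id)
    {
        int x = id * VEC_SIZE;

        /* Backtrack by OFFSET once for every already-completed tile... */
        const int tile_number = x / SRC_WIDTH;
        x -= tile_number * OFFSET;
        int x_input = x % SRC_WIDTH;

        /* ...and once more if this vector would cross the tile's right edge. */
        const int last_tile = (x_input + VEC_SIZE > SRC_WIDTH) ? 1 : 0;
        x -= last_tile * OFFSET;
        x_input = x % SRC_WIDTH;

        /* The kernel applies the same total backtrack to the output pointer. */
        const int out_x = id * VEC_SIZE - (tile_number + last_tile) * OFFSET;
        printf("thread %d: writes dst x=[%d..%d] from src x=[%d..%d]\n",
               id, out_x, out_x + VEC_SIZE - 1, x_input, x_input + VEC_SIZE - 1);
    }
    return 0;
}
```

Threads 0-2 cover the first tile (reading x = 0-3, 4-7 and 6-9) and threads 3-5 the second; the overlapping last vector of each tile rewrites a couple of elements rather than reading out of bounds.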
+__kernel void tile(
+    TENSOR4D_DECLARATION(input),
+    TENSOR4D_DECLARATION(output))
+{
+    Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH);
+    Tensor4D input  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, SRC_DEPTH);
+
+    // For all coordinates but x, each tile copies from the input
+    const int y     = get_global_id(1);
+    const int z     = get_global_id(2) % DST_DEPTH;
+    const int batch = get_global_id(2) / DST_DEPTH;
+
+#if defined(VEC_SIZE) && defined(OFFSET)
+    // If we are loading/storing multiple elements at a time, we must not exceed
+    // the input boundaries. The last threads need to backtrack by OFFSET elements;
+    // those offsets accumulate across the previous tiles.
+    const int id = (int)(get_global_id(0));
+    int x        = id * VEC_SIZE;
+
+    // Shift x based on the previous offsets
+    const int tile_number = x / SRC_WIDTH;
+    x -= (tile_number) * OFFSET;
+    int x_input = x % SRC_WIDTH;
+
+    // Shift x again if this is the last vector of a tile
+    const int last_tile = (int)(x_input + VEC_SIZE > SRC_WIDTH);
+    x -= last_tile * OFFSET;
+    x_input = x % SRC_WIDTH;
+    output.ptr -= (tile_number + last_tile) * OFFSET * output_stride_x;
+
+    // Update the input pointer
+    input.ptr = tensor4D_offset(&input, x_input, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES);
+
+    // Copy the data
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
+
+    VSTORE(VEC_SIZE)
+    (data, 0, (__global DATA_TYPE *)output.ptr);
+#else  // !defined(VEC_SIZE) || !defined(OFFSET)
+    const int x = get_global_id(0);
+
+    // Update the input pointer
+    input.ptr = tensor4D_offset(&input, x % SRC_WIDTH, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES);
+
+    *((__global DATA_TYPE *)(output.ptr)) = *((__global DATA_TYPE *)(input.ptr));
+#endif // defined(VEC_SIZE) && defined(OFFSET)
+}
+#endif // defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_DEPTH)
diff --git a/src/core/CL/cl_kernels/common/transpose.cl b/src/core/CL/cl_kernels/common/transpose.cl
new file mode 100644
index 0000000000..82db2908b5
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/transpose.cl
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#define PARTIAL_STORE_M0 VEC_SIZE_LEFTOVER_X +#define PARTIAL_STORE_N0 VEC_SIZE_LEFTOVER_Y + +#include "helpers.h" +#include "repeat.h" + +#if defined(DATA_TYPE_IN_BYTES) && defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X) && defined(VEC_SIZE_Y) && defined(VEC_SIZE_LEFTOVER_Y) + +#if VEC_SIZE_X == 1 +#if VEC_SIZE_Y == 1 +#define TRANSPOSED_U(val) \ + { \ + u0 \ + } +#elif VEC_SIZE_Y == 2 +#define TRANSPOSED_U(val) \ + { \ + u0, u1 \ + } +#elif VEC_SIZE_Y == 3 +#define TRANSPOSED_U(val) \ + { \ + u0, u1, u2 \ + } +#elif VEC_SIZE_Y == 4 +#define TRANSPOSED_U(val) \ + { \ + u0, u1, u2, u3 \ + } +#elif VEC_SIZE_Y == 8 +#define TRANSPOSED_U(val) \ + { \ + u0, u1, u2, u3, u4, u5, u6, u7 \ + } +#elif VEC_SIZE_Y == 16 +#define TRANSPOSED_U(val) \ + { \ + u0, u1, u2, u3, u4, u5, u6, u7, \ + u8, u9, u10, u11, u12, u13, u14, u15 \ + } +#endif /* switch VEC_SIZE_Y */ +#else // VEC_SIZE_X == 1 +#if VEC_SIZE_Y == 1 +#define TRANSPOSED_U(val) \ + { \ + u0.val \ + } +#elif VEC_SIZE_Y == 2 +#define TRANSPOSED_U(val) \ + { \ + u0.val, u1.val \ + } +#elif VEC_SIZE_Y == 3 +#define TRANSPOSED_U(val) \ + { \ + u0.val, u1.val, u2.val \ + } +#elif VEC_SIZE_Y == 4 +#define TRANSPOSED_U(val) \ + { \ + u0.val, u1.val, u2.val, u3.val \ + } +#elif VEC_SIZE_Y == 8 +#define TRANSPOSED_U(val) \ + { \ + u0.val, u1.val, u2.val, u3.val, u4.val, u5.val, u6.val, u7.val \ + } +#elif VEC_SIZE_Y == 16 +#define TRANSPOSED_U(val) \ + { \ + u0.val, u1.val, u2.val, u3.val, u4.val, u5.val, u6.val, u7.val, \ + u8.val, u9.val, u10.val, u11.val, u12.val, u13.val, u14.val, u15.val \ + } +#endif /* switch VEC_SIZE_Y */ +#endif // VEC_SIZE_X == 1 + +#if DATA_TYPE_IN_BYTES == 4 +#define DATA_TYPE uint +#elif DATA_TYPE_IN_BYTES == 2 +#define DATA_TYPE ushort +#elif DATA_TYPE_IN_BYTES == 1 +#define DATA_TYPE uchar +#else /* switch DATA_TYPE_IN_BYTES */ +#error DATA_TYPE_IN_BYTES not supported for transpose +#endif /* switch DATA_TYPE_IN_BYTES */ + +/** This OpenCL kernel computes the matrix transposition of input matrix + * + * @note The number of bytes of the data type need to be passed at compile time using -DDATA_TYPE_IN_BYTES. 
DATA_TYPE_IN_BYTES can be:
+ * -# -DDATA_TYPE_IN_BYTES=1 for transposing U8 or S8 matrices
+ * -# -DDATA_TYPE_IN_BYTES=2 for transposing U16, S16 or FP16 matrices
+ * -# -DDATA_TYPE_IN_BYTES=4 for transposing U32, S32 or FP32 matrices
+ * -# -DVEC_SIZE_X is the number of elements processed in the X dimension
+ * -# -DVEC_SIZE_LEFTOVER_X is the leftover size in the X dimension; x_dimension % VEC_SIZE_X
+ * -# -DVEC_SIZE_Y is the number of elements processed in the Y dimension
+ * -# -DVEC_SIZE_LEFTOVER_Y is the leftover size in the Y dimension; y_dimension % VEC_SIZE_Y
+ *
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: All
+ * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
+ * @param[out] dst_ptr                           Pointer to the destination matrix. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
+ */
+__kernel void transpose(IMAGE_DECLARATION(src),
+                        IMAGE_DECLARATION(dst))
+{
+    uint x_offs = max((int)(get_global_id(0) * VEC_SIZE_X - (VEC_SIZE_X - VEC_SIZE_LEFTOVER_X) % VEC_SIZE_X), 0);
+    uint y_offs = max((int)(get_global_id(1) * VEC_SIZE_Y - (VEC_SIZE_Y - VEC_SIZE_LEFTOVER_Y) % VEC_SIZE_Y), 0);
+
+    // Compute addresses
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * DATA_TYPE_IN_BYTES + y_offs * src_stride_y;
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + y_offs * DATA_TYPE_IN_BYTES + x_offs * dst_stride_y;
+
+    // Load the NxM block at (x, y)
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    u0 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)src_addr);
+#if VEC_SIZE_Y > 1
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    u1 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + src_stride_y));
+#endif /* VEC_SIZE_Y > 1 */
+#if VEC_SIZE_Y > 2
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    u2 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+#endif /* VEC_SIZE_Y > 2 */
+#if VEC_SIZE_Y > 3
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    u3 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+#endif /* VEC_SIZE_Y > 3 */
+#if VEC_SIZE_Y > 4
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    u4 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    u5 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    u6 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    u7 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y));
+#endif /* VEC_SIZE_Y > 4 */
+#if VEC_SIZE_Y > 8
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
+    u8 =
VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 8 * src_stride_y)); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) + u9 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 9 * src_stride_y)); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) + u10 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 10 * src_stride_y)); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) + u11 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 11 * src_stride_y)); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) + u12 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 12 * src_stride_y)); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) + u13 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 13 * src_stride_y)); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) + u14 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 14 * src_stride_y)); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) + u15 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 15 * src_stride_y)); +#endif /* VEC_SIZE_Y > 8 */ + + //Create transposed vectors + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) + t0 = TRANSPOSED_U(s0); +#if VEC_SIZE_X > 1 + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) + t1 = TRANSPOSED_U(s1); +#endif /* VEC_SIZE_X > 1 */ +#if VEC_SIZE_X > 2 + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) + t2 = TRANSPOSED_U(s2); +#endif /* VEC_SIZE_X > 2 */ +#if VEC_SIZE_X > 3 + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) + t3 = TRANSPOSED_U(s3); +#endif /* VEC_SIZE_X > 3 */ +#if VEC_SIZE_X > 4 + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) + t4 = TRANSPOSED_U(s4); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) + t5 = TRANSPOSED_U(s5); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) + t6 = TRANSPOSED_U(s6); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) + t7 = TRANSPOSED_U(s7); +#endif /* VEC_SIZE_X > 4 */ +#if VEC_SIZE_X > 8 + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) + t8 = TRANSPOSED_U(s8); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) + t9 = TRANSPOSED_U(s9); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) + tA = TRANSPOSED_U(sA); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) + tB = TRANSPOSED_U(sB); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) + tC = TRANSPOSED_U(sC); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) + tD = TRANSPOSED_U(sD); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) + tE = TRANSPOSED_U(sE); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y) + tF = TRANSPOSED_U(sF); +#endif /* VEC_SIZE_X > 8 */ + + // Store the block at (y, x) + REPEAT_VAR_INIT_TO_CONST(VEC_SIZE_X, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; + STORE_BLOCK_BOUNDARY_AWARE(VEC_SIZE_X, VEC_SIZE_Y, DATA_TYPE, t, (__global uchar *)dst_addr, dst_stride_y, zout, VEC_SIZE_LEFTOVER_X, VEC_SIZE_LEFTOVER_Y, VEC_SIZE_LEFTOVER_X != 0 + && get_global_id(0) == 0, + VEC_SIZE_LEFTOVER_Y != 0 && get_global_id(1) == 0); +} + +#endif // defined(DATA_TYPE_IN_BYTES) && defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X) && defined(VEC_SIZE_Y) && defined(VEC_SIZE_LEFTOVER_Y) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/common/unpooling_layer.cl b/src/core/CL/cl_kernels/common/unpooling_layer.cl new file mode 100644 index 0000000000..6662dc9360 --- /dev/null +++ b/src/core/CL/cl_kernels/common/unpooling_layer.cl @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "helpers.h" + +/** Performs max unpooling function with pool size equal to 2. + * + * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32 + * @note The width of the output tensor must be passed using -DWIDTH_DST e.g. -DWIDTH_DST=24 + * @note The height of the output tensor must be passed using -DHEIGHT_DST e.g. -DHEIGHT_DST=54 + * @note The depth of the output tensor must be passed using -DDEPTH_DST e.g. -DDEPTH_DST=32 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the output tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the output tensor + * @param[in] indices_ptr Pointer to the indices tensor. 
Supported data types: U32 + * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes) + * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes) + * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes) + * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor + */ +__kernel void max_unpooling_layer_2( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output), + TENSOR3D_DECLARATION(indices)) +{ + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(output); + Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices); + + unsigned int index = *((__global unsigned int *)indices.ptr); + DATA_TYPE value = *((__global DATA_TYPE *)input.ptr); + + *((__global DATA_TYPE *)tensor3D_index2ptr(&output, WIDTH_DST, HEIGHT_DST, DEPTH_DST, index)) = value; +} \ No newline at end of file diff --git a/src/core/CL/cl_kernels/comparisons.cl b/src/core/CL/cl_kernels/comparisons.cl deleted file mode 100644 index 408846144d..0000000000 --- a/src/core/CL/cl_kernels/comparisons.cl +++ /dev/null @@ -1,150 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#define EQUAL(x, y) ((x) == (y)) -#define NOTEQUAL(x, y) ((x) != (y)) -#define GREATER(x, y) ((x) > (y)) -#define GREATEREQUAL(x, y) ((x) >= (y)) -#define LESS(x, y) ((x) < (y)) -#define LESSEQUAL(x, y) ((x) <= (y)) - -#define DEFINE_KERNEL_STR(name) compare_##name -#define DEFINE_KERNEL(name) DEFINE_KERNEL_STR(name) - -#define DEFINE_KERNEL_QUANTIZED_STR(name) compare_##name##_quantized -#define DEFINE_KERNEL_QUANTIZED(name) DEFINE_KERNEL_QUANTIZED_STR(name) - -#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OP) && defined(OP_NAME) -/** This function compares two tensors. - * - * @attention The inputs' data type need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. 
-DVEC_SIZE=16 - * @attention The comparison operation should be given as a preprocessor argument using -DOP=operation. e.g. -DOP=LESS - * - * @param[in] in1_ptr Pointer to the source tensor. Supported data types: All non-quantized data types. - * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] in2_ptr Pointer to the source tensor. Supported data types: same as @p in1_ptr - * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8 - * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void DEFINE_KERNEL(OP_NAME)( - TENSOR3D_DECLARATION(in1), - TENSOR3D_DECLARATION(in2), - TENSOR3D_DECLARATION(out)) -{ - // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); - - // Load values - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - in_a = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1.ptr); - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - in_b = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2.ptr); - - // Calculate and store result - VSTORE(VEC_SIZE) - (CONVERT(OP(in_a, in_b), VEC_DATA_TYPE(uchar, VEC_SIZE)), 0, (__global uchar *)out.ptr); -} -#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OP) && defined(OP_NAME) */ - -#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) -/** This function compares two quantized tensors. - * - * @note The inputs' data type need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar - * @note The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, i.e. 
-DOFFSET_IN1=10 - * @note The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, i.e. -DOFFSET_IN2=10 - * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, i.e. -DSCALE_IN1=10 - * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, i.e. -DSCALE_IN2=10 - * - * @param[in] in1_ptr Pointer to the source tensor. Supported data types: All quantized data types. - * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] in2_ptr Pointer to the source tensor. Supported data types: same as @p in1_ptr - * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] out_ptr Pointer to the destination tensor. 
Supported data types: U8 - * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void DEFINE_KERNEL_QUANTIZED(OP_NAME)( - TENSOR3D_DECLARATION(in1), - TENSOR3D_DECLARATION(in2), - TENSOR3D_DECLARATION(out)) -{ - // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); - - int16 in_a = CONVERT(vload16(0, (__global DATA_TYPE *)in1.ptr), int16); - int16 in_b = CONVERT(vload16(0, (__global DATA_TYPE *)in2.ptr), int16); - - in_a = in_a - (int16)((int)OFFSET_IN1); - in_b = in_b - (int16)((int)OFFSET_IN2); - - const float16 in1f32 = convert_float16(in_a) * (float16)((float)SCALE_IN1); - const float16 in2f32 = convert_float16(in_b) * (float16)((float)SCALE_IN2); - const int16 res = OP(in1f32, in2f32); - - // Store result - vstore16(convert_uchar16(res), 0, (__global uchar *)out.ptr); -} -#endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(SCALE_IN1) && defined(SCALE_IN2) */ \ No newline at end of file diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl deleted file mode 100644 index d2e65408dc..0000000000 --- a/src/core/CL/cl_kernels/concatenate.cl +++ /dev/null @@ -1,415 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "helpers.h" - -#if defined(VEC_SIZE) -#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) - -#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) -#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) -#define VEC_QUANT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) -#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) -#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) -inline VEC_QUANT requantize(VEC_QUANT input, float in_offset, float out_offset, float in_scale, float out_scale) -{ - const VEC_FLOAT in_f32 = (CONVERT(input, VEC_FLOAT) - (VEC_FLOAT)((float)in_offset)) * (VEC_FLOAT)((float)in_scale); - const VEC_FLOAT out_f32 = in_f32 / ((VEC_FLOAT)(float)out_scale) + ((VEC_FLOAT)((float)out_offset)); - const VEC_QUANT res_q8 = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT), VEC_QUANT); - return res_q8; -} -#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ - -#if defined(DATA_TYPE) -#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - -#if defined(DEPTH) && defined(ELEMENT_SIZE) -#if defined(INPUT1_WIDTH) - -#define SELECT_TYPE SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) -#define SEQ VEC_OFFS(int, VEC_SIZE) - -/** This kernel concatenates two input tensors into the output tensor along the first dimension - * - * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float - * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16 - * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE - * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16 - * @note First input tensor width should be given as a preprocessor argument using -DINPUT1_WIDTH=width. e.g. -DINPUT1_WIDTH=8 - * - * @param[in] src1_ptr Pointer to the source tensor. Supported data types: All. - * @param[in] src1_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src1_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src1_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src1_stride_w Stride of the first source tensor in Z dimension (in bytes) - * @param[in] src1_step_w src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] src2_ptr Pointer to the source tensor. 
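(For illustration, not part of the original file: a scalar C model of the requantize() helper defined above; the function name and the use of uchar for DATA_TYPE are assumptions of this sketch. CONVERT_RTE rounds to nearest even, which rintf matches under the default rounding mode.)

#include <math.h>

static unsigned char requantize_scalar(unsigned char in,
                                       float in_offset, float out_offset,
                                       float in_scale, float out_scale)
{
    /* Dequantize with the input quantization info... */
    const float in_f32 = ((float)in - in_offset) * in_scale;
    /* ...then quantize with the output scale and offset */
    const float out_f32 = in_f32 / out_scale + out_offset;
    /* CONVERT_RTE rounds to nearest even; CONVERT_SAT clamps to the U8 range */
    const float r = rintf(out_f32);
    return (unsigned char)(r < 0.0f ? 0.0f : (r > 255.0f ? 255.0f : r));
}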
Supported data types: same as @p src1_ptr - * @param[in] src2_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src2_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src2_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src2_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src2_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src2_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src2_stride_w Stride of the first source tensor in Z dimension (in bytes) - * @param[in] src2_step_w src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src1_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void concatenate_width_x2( - TENSOR4D_DECLARATION(src1), - TENSOR4D_DECLARATION(src2), - TENSOR4D_DECLARATION(dst)) -{ - // Calculate input indices - const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); - const int y = get_global_id(1); - const int z = get_global_id(2) % (int)DEPTH; - const int w = get_global_id(2) / (int)DEPTH; - const int x1 = min(x, (int)INPUT1_WIDTH - (int)VEC_SIZE); - const int x2 = max(x - (int)INPUT1_WIDTH, 0); - - // Calculate inputs and output addresses - const __global uchar *dst_addr = dst_ptr + (int)dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * (int)dst_stride_y + z * (int)dst_stride_z + w * (int)dst_stride_w; - const __global uchar *src1_addr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * sizeof(DATA_TYPE) + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w; - const __global uchar *src2_addr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * sizeof(DATA_TYPE) + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w; - - VEC_TYPE src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src1_addr); - VEC_TYPE src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src2_addr); - -#if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) - src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT); - src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT); -#endif /* defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && 
defined(SCALE_OUT) */ - const VEC_INT x_coords = SEQ + (VEC_INT)(x); - - // Rotate src1/2_values, if values0 is a combination of src1_values and src2_values. - SELECT_TYPE cond = CONVERT(((VEC_INT)x < (VEC_INT)INPUT1_WIDTH) && ((VEC_INT)x > (VEC_INT)(INPUT1_WIDTH - VEC_SIZE)), SELECT_TYPE); - src1_values = select(src1_values, ROTATE(src1_values, VEC_SIZE, INPUT1_ROTATE_N), cond); - src2_values = select(src2_values, ROTATE(src2_values, VEC_SIZE, INPUT1_ROTATE_N), cond); - - cond = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH), SELECT_TYPE); - const VEC_TYPE values0 = select(src2_values, src1_values, cond); - - STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) -} - -#if defined(INPUT2_WIDTH) && defined(INPUT3_WIDTH) -/** This kernel concatenates four input tensors into the output tensor along the first dimension - * - * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float - * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16 - * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE - * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16 - * @note First input tensor width should be given as a preprocessor argument using -DINPUT1_WIDTH=width. e.g. -DINPUT1_WIDTH=8 - * @note Second input tensor width should be given as a preprocessor argument using -DINPUT2_WIDTH=width. e.g. -DINPUT2_WIDTH=8 - * @note Third input tensor width should be given as a preprocessor argument using -DINPUT3_WIDTH=width. e.g. -DINPUT3_WIDTH=8 - * - * @param[in] src1_ptr Pointer to the source tensor. Supported data types: All - * @param[in] src1_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src1_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src1_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src1_stride_w Stride of the first source tensor in Z dimension (in bytes) - * @param[in] src1_step_w src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] src2_ptr Pointer to the source tensor. 
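(For illustration, not part of the original file: a standalone C program modelling the x/x1/x2 index computation at the top of concatenate_width_x2 above; VEC_SIZE, INPUT1_WIDTH and VEC_SIZE_LEFTOVER values are made up. The first work-item covers the leftover elements, and every later one is shifted back by (VEC_SIZE - VEC_SIZE_LEFTOVER) so that all loads are full, in-bounds vectors.)

#include <stdio.h>

int main(void)
{
    const int VEC_SIZE = 16, INPUT1_WIDTH = 20, VEC_SIZE_LEFTOVER = 5; /* e.g. total width 37: 37 % 16 = 5 */
    for (int gid0 = 0; gid0 < 3; ++gid0)
    {
        int x = gid0 * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE;
        if (x < 0) x = 0;                                                         /* max(..., 0) */
        const int x1 = x < INPUT1_WIDTH - VEC_SIZE ? x : INPUT1_WIDTH - VEC_SIZE; /* clamp the load into src1 */
        const int x2 = x > INPUT1_WIDTH ? x - INPUT1_WIDTH : 0;                   /* rebase the load into src2 */
        printf("gid0=%d: x=%d x1=%d x2=%d\n", gid0, x, x1, x2);
    }
    return 0;
}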
Supported data types: same as @p src1_ptr - * @param[in] src2_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src2_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src2_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src2_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src2_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src2_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src2_stride_w Stride of the first source tensor in Z dimension (in bytes) - * @param[in] src2_step_w src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src2_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] src3_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr - * @param[in] src3_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src3_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src3_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src3_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src3_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src3_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src3_stride_w Stride of the first source tensor in Z dimension (in bytes) - * @param[in] src3_step_w src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src3_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] src4_ptr Pointer to the source tensor. Supported data types: same as @p src1_ptr - * @param[in] src4_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src4_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src4_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src4_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src4_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src4_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src4_stride_w Stride of the first source tensor in Z dimension (in bytes) - * @param[in] src4_step_w src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src4_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src1_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void concatenate_width_x4( - TENSOR4D_DECLARATION(src1), - TENSOR4D_DECLARATION(src2), - TENSOR4D_DECLARATION(src3), - TENSOR4D_DECLARATION(src4), - TENSOR4D_DECLARATION(dst)) -{ - // Calculate input indices - const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); - const int y = get_global_id(1); - const int z = get_global_id(2) % (int)DEPTH; - const int w = get_global_id(2) / (int)DEPTH; - - const int x1 = min(x, (int)INPUT1_WIDTH - (int)VEC_SIZE); - const int x2 = min(max(x - (int)INPUT1_WIDTH, 0), (int)INPUT2_WIDTH - (int)VEC_SIZE); - const int x3 = min(max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH, 0), (int)INPUT3_WIDTH - (int)VEC_SIZE); - const int x4 = max(x - (int)INPUT1_WIDTH - (int)INPUT2_WIDTH - (int)INPUT3_WIDTH, 0); - - // Calculate inputs and output addresses - const __global uchar *dst_addr = dst_ptr + (int)dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * (int)dst_stride_y + z * (int)dst_stride_z + w * (int)dst_stride_w; - const __global uchar *src1_addr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * sizeof(DATA_TYPE) + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w; - const __global uchar *src2_addr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * sizeof(DATA_TYPE) + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w; - const __global uchar *src3_addr = src3_ptr + (int)src3_offset_first_element_in_bytes + x3 * sizeof(DATA_TYPE) + y * (int)src3_stride_y + z * (int)src3_stride_z + w * (int)src3_stride_w; - const __global uchar *src4_addr = src4_ptr + (int)src4_offset_first_element_in_bytes + x4 * sizeof(DATA_TYPE) + y * (int)src4_stride_y + z * (int)src4_stride_z + w * (int)src4_stride_w; - - VEC_TYPE src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src1_addr); - VEC_TYPE src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src2_addr); - VEC_TYPE src3_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src3_addr); - VEC_TYPE src4_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src4_addr); - -#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4) - src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT); - src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT); - src3_values = requantize(src3_values, OFFSET_IN3, OFFSET_OUT, SCALE_IN3, SCALE_OUT); - src4_values = requantize(src4_values, OFFSET_IN4, OFFSET_OUT, 
SCALE_IN4, SCALE_OUT); -#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4) */ - - const VEC_INT x_coords = SEQ + (VEC_INT)(x); - - SELECT_TYPE cond_in2 = CONVERT(((VEC_INT)x < (VEC_INT)INPUT1_WIDTH && (VEC_INT)x > (VEC_INT)(INPUT1_WIDTH - VEC_SIZE)), SELECT_TYPE); - SELECT_TYPE cond_in3 = CONVERT(((VEC_INT)x < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH) && (VEC_INT)x > (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH - VEC_SIZE)), SELECT_TYPE); - SELECT_TYPE cond_in4 = CONVERT(((VEC_INT)x < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH) && (VEC_INT)x > (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH - VEC_SIZE)), SELECT_TYPE); - - // Rotate src1/2_values, if values0 is a combination of src1_values and src2_values. - src1_values = select(src1_values, ROTATE(src1_values, VEC_SIZE, INPUT1_ROTATE_N), cond_in2); - src2_values = select(src2_values, ROTATE(src2_values, VEC_SIZE, INPUT1_ROTATE_N), cond_in2); - // Rotate src2/3_values, if values0 is a combination of src2_values and src3_values. - src2_values = select(src2_values, ROTATE(src2_values, VEC_SIZE, INPUT2_ROTATE_N), cond_in3); - src3_values = select(src3_values, ROTATE(src3_values, VEC_SIZE, INPUT2_ROTATE_N), cond_in3); - // Rotate src3/4_values, if values0 is a combination of src3_values and src4_values. - src3_values = select(src3_values, ROTATE(src3_values, VEC_SIZE, INPUT3_ROTATE_N), cond_in4); - src4_values = select(src4_values, ROTATE(src4_values, VEC_SIZE, INPUT3_ROTATE_N), cond_in4); - - cond_in2 = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH), SELECT_TYPE); - cond_in3 = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH), SELECT_TYPE); - cond_in4 = CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH), SELECT_TYPE); - - VEC_TYPE values0 = select(src2_values, src1_values, cond_in2); - values0 = select(src3_values, values0, cond_in3); - values0 = select(src4_values, values0, cond_in4); - - STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) -} -#endif /* defined(INPUT2_WIDTH) && defined(INPUT3_WIDTH) */ -#endif /* defined(INPUT1_WIDTH) */ -#endif /* defined(DEPTH) && defined(ELEMENT_SIZE) */ - -#if defined(WIDTH_OFFSET) && defined(DEPTH) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) -/** This kernel concatenates the input tensor into the output tensor along the first dimension - * - * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float - * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16 - * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE - * @note The offset for the first spatial dimension has to be passed at compile time using -DWIDTH_OFFSET. i.e. -DWIDTH_OFFSET=128 - * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16 - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the first source tensor in Z dimension (in bytes) - * @param[in] src_step_w src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ - -__kernel void concatenate_width( - TENSOR4D_DECLARATION(src), - TENSOR4D_DECLARATION(dst)) -{ - // Calculate input indices - const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); - const int y = get_global_id(1); - const int z = get_global_id(2) % (int)DEPTH; - const int w = get_global_id(2) / (int)DEPTH; - - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + z * src_stride_z + w * src_stride_w; - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + w * dst_stride_w; - - VEC_TYPE source_values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr); - -#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) - const VEC_QUANT out0 = requantize(source_values0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT); - STORE_VECTOR_SELECT(out, DATA_TYPE, dst_addr + WIDTH_OFFSET * sizeof(DATA_TYPE), VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) -#else /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ - STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + WIDTH_OFFSET * sizeof(DATA_TYPE), VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) -#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ -} - -#endif /* defined(WIDTH_OFFSET) && defined(DEPTH) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)*/ - -#if defined(VEC_SIZE_LEFTOVER) - -#if defined(HEIGHT_OFFSET) && defined(DEPTH) && defined(VEC_SIZE) -/** 
This kernel concatenates the input tensor into the output tensor along the second dimension - * - * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float - * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16 - * @note Vector sizes supported are 2, 4, 8 and 16. - * @note The offset for the second spatial dimension has to be passed at compile time using -DHEIGHT_OFFSET. i.e. -DHEIGHT_OFFSET=128 - * @note Tensor depth should be given as a preprocessor argument using -DDEPTH=size. e.g. -DDEPTH=16 - * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ - -__kernel void concatenate_height( - TENSOR4D_DECLARATION(src), - TENSOR4D_DECLARATION(dst)) -{ - const int x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE); - - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + (get_global_id(2) % DEPTH) * src_stride_z + (get_global_id( - 2) / DEPTH) * src_stride_w; - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + (get_global_id(2) % DEPTH) * dst_stride_z + (get_global_id( - 2) / DEPTH) * dst_stride_w; - - VEC_TYPE source_values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr); - -#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) - const VEC_QUANT out0 = requantize(source_values0,
OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT); - STORE_VECTOR_SELECT(out, DATA_TYPE, dst_addr + HEIGHT_OFFSET * dst_stride_y, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) -#else /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ - STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + HEIGHT_OFFSET * dst_stride_y, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) -#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ -} - -#endif /* defined(HEIGHT_OFFSET) && defined(DEPTH) && defined(VEC_SIZE) */ - -/** This kernel concatenates the input tensor into the output tensor along the third dimension - * - * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float - * @note Vector size has to be passed at compile time using -DVEC_SIZE. i.e. -DVEC_SIZE=16 - * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: All - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor.
Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] offset The offset to the first valid element of the output tensor in bytes - */ -__kernel void concatenate( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - int offset) -{ - uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0); - - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z; - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z; - - VEC_TYPE source_values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr); - -#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) - source_values0 = requantize(source_values0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT); -#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ - - STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + offset, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) -} -#endif /* defined(VEC_SIZE_LEFTOVER) */ -#endif /* defined(DATA_TYPE) */ -#endif /* defined(VEC_SIZE) */ diff --git a/src/core/CL/cl_kernels/convert_fc_weights.cl b/src/core/CL/cl_kernels/convert_fc_weights.cl deleted file mode 100644 index a451c0213b..0000000000 --- a/src/core/CL/cl_kernels/convert_fc_weights.cl +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
- */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(FACTOR_1) && defined(FACTOR_2) -/** Perform a NCHW -> NHWC or NHWC -> NCHW conversion for Fully Connected 2D weights. - * - * For NCHW -> NHWC, FACTOR_1 will be equal to the product of the first two dimensions of FullyConnectedLayer's input and FACTOR_2 will represent the number of channels of that tensor. - * For NHWC -> NCHW, FACTOR_1 and FACTOR_2 will hold the same values, but swapped. - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Original input tensor width*height and depth should be given as a preprocessor argument using -DFACTOR_1=size and -DFACTOR_2=size for NCHW and vice versa for NHWC. e.g. -DFACTOR_1=256 and -DFACTOR_2=128 - * - * @param[in] src_ptr Pointer to the source image. Supported data types: All. - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void convert_fc_weights( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(0) * dst_stride_x + (get_global_id(1) % FACTOR_1 * FACTOR_2 + get_global_id(1) / FACTOR_1) * dst_stride_y; - - *((__global DATA_TYPE *)dst_addr) = *((__global DATA_TYPE *)src.ptr); -} -#endif // defined(DATA_TYPE) && defined(FACTOR_1) && defined(FACTOR_2) diff --git a/src/core/CL/cl_kernels/convolution_layer.cl b/src/core/CL/cl_kernels/convolution_layer.cl deleted file mode 100644 index cfd1f12328..0000000000 --- a/src/core/CL/cl_kernels/convolution_layer.cl +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) 2017-2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
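(For illustration, not part of the original file: the row permutation applied by convert_fc_weights above, as a standalone C program; the FACTOR_1 and FACTOR_2 values are made up. Note that % and * have equal precedence and associate left, so the kernel's expression reads (y % FACTOR_1) * FACTOR_2 + y / FACTOR_1.)

#include <stdio.h>

int main(void)
{
    const int FACTOR_1 = 4, FACTOR_2 = 3; /* e.g. H*W = 4, C = 3 for an NCHW -> NHWC conversion */
    for (int y = 0; y < FACTOR_1 * FACTOR_2; ++y)
        printf("src row %2d -> dst row %2d\n", y, (y % FACTOR_1) * FACTOR_2 + y / FACTOR_1);
    return 0;
}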
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(NUM_GROUPS) -/** This kernel reshapes the tensor's lowest three dimensions into a single column - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @note The number of groups should be given as a preprocessor argument using -DNUM_GROUPS=number. e.g. -DNUM_GROUPS=2 - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: All - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] bias_ptr Pointer to the bias tensor. Supported data types: F16/F32, for quantized types this must be nullptr - * @param[in] bias_stride_x Stride of the bias tensor in X dimension (in bytes) - * @param[in] bias_step_x bias_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] bias_offset_first_element_in_bytes The offset of the first element in the bias tensor - * @param[in] width The width of the input tensor - * @param[in] height The height of the input tensor - * @param[in] depth The depth of the input tensor - * @param[in] total_filters Total number of filters.
4th dimension of the weights matrix - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - */ -__kernel void reshape_to_columns( - TENSOR3D_DECLARATION(src), - IMAGE_DECLARATION(dst), -#ifdef HAS_BIAS - VECTOR_DECLARATION(bias), -#endif /* HAS_BIAS */ - uint width, uint height, uint depth, uint total_filters, uint dst_stride_z) -{ - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); - bool is_last_thread = (get_global_id(0) == (get_global_size(0) - 1) && get_global_id(1) == (get_global_size(1) - 1) && get_global_id(2) == (get_global_size(2) - 1)); - - __global uchar *tmp_src_ptr = src.ptr; - __global uchar *tmp_dst_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(0) * dst_stride_y + get_global_id(1) * width * dst_stride_y + get_global_id( - 2) * width * height * dst_stride_y; -#ifdef HAS_BIAS - __global uchar *tmp_bias_ptr = bias_ptr + bias_offset_first_element_in_bytes; -#endif /* HAS_BIAS */ - - if(is_last_thread) - { - for(uint g = 0; g < NUM_GROUPS; ++g) - { - __global uchar *curr_group_dst = tmp_dst_ptr; - - for(uint i = 0; i < total_filters / NUM_GROUPS; ++i) - { - *((__global DATA_TYPE *)curr_group_dst) = *((__global DATA_TYPE *)tmp_src_ptr); - -#ifdef HAS_BIAS - *((__global DATA_TYPE *)(curr_group_dst + dst_stride_y)) = *((__global DATA_TYPE *)(tmp_bias_ptr)); - tmp_bias_ptr += bias_stride_x; -#endif /* HAS_BIAS */ - tmp_src_ptr += depth * src_stride_z; - curr_group_dst += dst_stride_x; - } - - tmp_dst_ptr += dst_stride_z; - } - } - else - { - for(uint g = 0; g < NUM_GROUPS; ++g) - { - __global uchar *curr_group_dst = tmp_dst_ptr; - - for(uint i = 0; i < total_filters / NUM_GROUPS; ++i) - { - *((__global DATA_TYPE *)curr_group_dst) = *((__global DATA_TYPE *)tmp_src_ptr); - tmp_src_ptr += depth * src_stride_z; - curr_group_dst += dst_stride_x; - } - - tmp_dst_ptr += dst_stride_z; - } - } -} -#endif // defined(DATA_TYPE) diff --git a/src/core/CL/cl_kernels/copy_tensor.cl b/src/core/CL/cl_kernels/copy_tensor.cl deleted file mode 100644 index 9c90969827..0000000000 --- a/src/core/CL/cl_kernels/copy_tensor.cl +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) -/** Performs a copy of input tensor to the output tensor. 
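(For illustration, not part of the original file: how reshape_to_columns above linearizes element (x, y, z) of a width x height x depth input into a destination row index; the sizes are made up.)

#include <stdio.h>

int main(void)
{
    const int width = 3, height = 2, depth = 2;
    for (int z = 0; z < depth; ++z)
        for (int y = 0; y < height; ++y)
            for (int x = 0; x < width; ++x)
                /* Matches the kernel's dst offset: gid0 + gid1*width + gid2*width*height, in rows */
                printf("src(%d,%d,%d) -> dst row %d\n", x, y, z, x + y * width + z * width * height);
    return 0;
}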
- * - * @note The following variables must be passed at compile time: - * -# -DDATA_TYPE : Input and output datatypes. - * -# -DVEC_SIZE : The number of elements processed in X dimension - * -# -DVEC_SIZE_LEFTOVER: Leftover size in the X dimension; x_dimension % VEC_SIZE - * - * @param[in] in_ptr Pointer to the source tensor. Supported data types: All - * @param[in] in_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] in_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] in_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] in_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] out_ptr Pointer to the destination tensor. Supported data types: same as @p in_ptr - * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] out_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] out_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] out_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void copy_tensor( - TENSOR3D_DECLARATION(in), - TENSOR3D_DECLARATION(out)) -{ - Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(in); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); - - // Boundary-aware access: - // If there is a leftover in width (VEC_SIZE_LEFTOVER > 0): - // Shift all accesses other than the first to avoid accessing out of bounds - const int shift = max((int)(get_global_id(0) * VEC_SIZE) - (int)VEC_SIZE_LEFTOVER, 0) % VEC_SIZE; - in.ptr -= shift * in.stride_x; - out.ptr -= shift * out.stride_x; - - // Load data - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr); - - // Boundary-aware store - STORE_VECTOR_SELECT(data, DATA_TYPE, (__global DATA_TYPE *)out.ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); -} -#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/crop_tensor.cl b/src/core/CL/cl_kernels/crop_tensor.cl deleted file mode 100644 index d9090dc838..0000000000 --- a/src/core/CL/cl_kernels/crop_tensor.cl +++ /dev/null @@ -1,96 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) // Compile time constants - -/** Performs a tensor cropping. - * - * @param[in] in_ptr Pointer to the source tensor. Supported data types: All - * @param[in] in_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] in_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] in_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] in_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] out_ptr Pointer to the destination tensor. Supported data types: F32 - * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] out_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] out_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] out_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] in_offset_y The initial offset of the input address along Y. - * @param[in] in_offset_z The initial offset of the input address along Z. 
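 * @note For illustration (values assumed here, not part of the original file): with WIDTH_FLIPPED defined and in_offset_y = 5, successive work-items read rows in_y = 5, 4, 3, ...; without it, rows in_y = 5, 6, 7, .... HEIGHT_FLIPPED selects the direction along Z in the same way.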
- */ -__kernel void crop_tensor( - TENSOR3D_DECLARATION(in), - TENSOR3D_DECLARATION(out), - int in_offset_y, - int in_offset_z) -{ - Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(in); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); - - const int in_x = get_global_id(0) * (in_step_x / in_stride_x); - -#if defined(WIDTH_FLIPPED) - const int in_y = in_offset_y - get_global_id(1); -#else // defined(WIDTH_FLIPPED) - const int in_y = in_offset_y + get_global_id(1); -#endif // defined(WIDTH_FLIPPED) - -#if defined(HEIGHT_FLIPPED) - const int in_z = in_offset_z - get_global_id(2); -#else // defined(HEIGHT_FLIPPED) - const int in_z = in_offset_z + get_global_id(2); -#endif // defined(HEIGHT_FLIPPED) - -#if defined(VEC_SIZE) - -#if defined(LAST_ACCESSED_X) - // Check if access on width gets out of bounds - // If it does then shift access vector to access elements within bounds - const int shift = max((int)(get_global_id(0) * VEC_SIZE) - (int)LAST_ACCESSED_X, 0); - in.ptr -= shift * in.stride_x; - out.ptr -= shift * out.stride_x; -#endif // defined(LAST_ACCESSED_X) - - __global const uchar *input_addr = tensor3D_offset(&in, in_x, in_y, in_z); - - // Load data - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr); - - // Store result - VSTORE(VEC_SIZE) - (CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE)), 0, (__global float *)out.ptr); -#else // defined(VEC_SIZE) - *((__global float *)(out.ptr)) = CONVERT(*((__global DATA_TYPE *)tensor3D_offset(&in, in_x, in_y, in_z)), float); -#endif // defined(VEC_SIZE) -} - -#endif // defined(DATA_TYPE) && defined(LAST_ACCESSED_X) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/deconvolution_layer.cl b/src/core/CL/cl_kernels/deconvolution_layer.cl deleted file mode 100644 index b1d5e61476..0000000000 --- a/src/core/CL/cl_kernels/deconvolution_layer.cl +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** This function applies upsample on an input image. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: All. 
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void deconvolution_upsample( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - - // Store result - *((__global DATA_TYPE *)dst.ptr) = *((__global DATA_TYPE *)src.ptr); -} - -#if defined(FILTER_WIDTH) && defined(FILTER_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) -/** This kernel reshapes the deconvolution output tensor before returning the result of the Deconvolution. The deconvolution output tensor - * is the result of a @ref CLGEMM operation between the deconvolution input and the deconvolution filter - * - * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type, e.g., -DDATA_TYPE=F32 - * @note The width of the filter should be given as a preprocessor argument using -DFILTER_WIDTH=width, e.g., -DFILTER_WIDTH=2 - * @note The height of the filter should be given as a preprocessor argument using -DFILTER_HEIGHT=height, e.g., -DFILTER_HEIGHT=2 - * @note The width of the input should be given as a preprocessor argument using -DSRC_WIDTH=width, e.g., -DSRC_WIDTH=10 - * @note The height of the input should be given as a preprocessor argument using -DSRC_HEIGHT=height, e.g., -DSRC_HEIGHT=10 - * @note The output data layout is NHWC if the preprocessor argument NUM_FILTERS is defined, NCHW if NUM_FILTERS is not defined - * - * @param[in] src_ptr Pointer to the source image.
Supported data types: QASYMM8/QASYMM8_SIGNED/F16/F32 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] bias_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32/S32 - * @param[in] bias_stride_x (Optional) Stride of the biases vector in X dimension (in bytes) - * @param[in] bias_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector - */ -__kernel void deconvolution_reshape( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst) -#if defined(ADD_BIAS) - , - VECTOR_DECLARATION(bias) -#endif // defined(ADD_BIAS) -) -{ -#define FILTER_AREA ((FILTER_WIDTH) * (FILTER_HEIGHT)) - - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(dst); - const DATA_TYPE data = *(__global DATA_TYPE *)src.ptr; - - // Store result - const int x_in = get_global_id(0); - const int y_in = get_global_id(1); - const int z_in = get_global_id(2); - -#if defined(NUM_FILTERS) - const int bias_index = x_in / (FILTER_AREA); - const int z_out = bias_index + (NUM_FILTERS) * (z_in / (SRC_HEIGHT)); - const int x_out = x_in % (FILTER_WIDTH) + y_in * (FILTER_WIDTH); - const int y_out = (FILTER_HEIGHT) * (z_in % (SRC_HEIGHT)) + ((x_in % (FILTER_AREA)) / (FILTER_WIDTH)); -#else // defined(NUM_FILTERS) - const int x_out = x_in / (FILTER_AREA); - const int y_out = x_in % (FILTER_WIDTH) + y_in * (FILTER_WIDTH); - const int z_out = (FILTER_HEIGHT) * z_in + ((x_in % (FILTER_AREA)) / (FILTER_WIDTH)); - const int bias_index = x_out; -#endif // defined(NUM_FILTERS) - -#if defined(ADD_BIAS) - Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias); - const DATA_TYPE bias_val = *(__global DATA_TYPE *)vector_offset(&bias, bias_index); - *((__global DATA_TYPE *)tensor3D_offset(&dst, x_out, y_out, z_out)) = data + bias_val; -#else // defined(ADD_BIAS) - *((__global DATA_TYPE *)tensor3D_offset(&dst, x_out, y_out, z_out)) = data; -#endif // defined(ADD_BIAS) - -#undef FILTER_AREA -} -#endif // defined(FILTER_WIDTH) && defined(FILTER_HEIGHT) && defined(SRC_WIDTH) && 
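(For illustration, not part of the original file: the NCHW branch of the index mapping in deconvolution_reshape above, as a standalone C program; the filter and source sizes are made up.)

#include <stdio.h>

int main(void)
{
    const int FILTER_WIDTH = 2, FILTER_HEIGHT = 2;
    const int FILTER_AREA  = FILTER_WIDTH * FILTER_HEIGHT;
    const int y_in = 0; /* a single GEMM column, for brevity */
    for (int z_in = 0; z_in < 2; ++z_in)
        for (int x_in = 0; x_in < 2 * FILTER_AREA; ++x_in)
        {
            const int x_out = x_in / FILTER_AREA;                                         /* filter (output channel) index */
            const int y_out = x_in % FILTER_WIDTH + y_in * FILTER_WIDTH;                  /* output column */
            const int z_out = FILTER_HEIGHT * z_in + (x_in % FILTER_AREA) / FILTER_WIDTH; /* output row */
            printf("in(%d,%d,%d) -> out(%d,%d,%d)\n", x_in, y_in, z_in, x_out, y_out, z_out);
        }
    return 0;
}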
defined(SRC_HEIGHT) && defined(DATA_TYPE) diff --git a/src/core/CL/cl_kernels/depth_to_space.cl b/src/core/CL/cl_kernels/depth_to_space.cl deleted file mode 100644 index f301e64d66..0000000000 --- a/src/core/CL/cl_kernels/depth_to_space.cl +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE) -/** Depth to space transformation. (NCHW) - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note The input tensor depth size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2 - * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: All. - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[in] batch_id The input tensor batch id - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void depth_to_space_nchw( - TENSOR3D_DECLARATION(input), - const int batch_id, - TENSOR4D_DECLARATION(output)) -{ - Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); - - const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE)); - const int x = get_global_id(0); - const int y = get_global_id(1); - const int z = get_global_id(2) % r; - - const int out_x = x * BLOCK_SHAPE + (get_global_id(2) / r) % BLOCK_SHAPE; - const int out_y = y * BLOCK_SHAPE + (get_global_id(2) / r) / BLOCK_SHAPE; - - *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, batch_id)) = *((__global DATA_TYPE *)in.ptr); -} -/** Depth to space transformation. (NHWC) - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note The input tensor depth size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2 - * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: All. - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[in] batch_id The input tensor batch id - * @param[out] output_ptr Pointer to the destination tensor. 
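/* To make the depth_to_space_nchw index arithmetic above concrete, here is the
   same mapping in plain C with assumed compile-time values BLOCK_SHAPE=2 and
   CHANNEL_SIZE=8 (so r = 2): each input channel index z_in is split into an
   output channel and a sub-position inside a 2x2 spatial block. */
#include <stdio.h>

int main(void)
{
    const int block = 2, channel_size = 8;
    const int r = channel_size / (block * block); /* output channels, r = 2 */
    const int x = 1, y = 1;                       /* an arbitrary input pixel */
    for (int z_in = 0; z_in < channel_size; ++z_in)
    {
        const int z_out = z_in % r;
        const int out_x = x * block + (z_in / r) % block;
        const int out_y = y * block + (z_in / r) / block;
        printf("z_in=%d -> (x=%d, y=%d, z=%d)\n", z_in, out_x, out_y, z_out);
    }
    return 0;
}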
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void depth_to_space_nhwc( - TENSOR3D_DECLARATION(input), - const int batch_id, - TENSOR4D_DECLARATION(output)) -{ - Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); - - const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE)); - const int x = get_global_id(1); - const int y = get_global_id(2); - const int z = get_global_id(0) % r; - - const int out_x = x * BLOCK_SHAPE + (get_global_id(0) / r) % BLOCK_SHAPE; - const int out_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE; - - *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, batch_id)) = *((__global DATA_TYPE *)in.ptr); -} -#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/dequantization_layer.cl b/src/core/CL/cl_kernels/dequantization_layer.cl deleted file mode 100644 index 127f67d940..0000000000 --- a/src/core/CL/cl_kernels/dequantization_layer.cl +++ /dev/null @@ -1,212 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET) - -/** This performs the dequantization of 8-bit unsigned integers to floating point. - * - * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char - * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. 
-DVEC_SIZE=16 - * @note Quantization scale of input tensor is passed in with -DSCALE=scale. - * @note Quantization offset of input tensor is passed in with -DOFFSET=offset. - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32 - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void dequantization_layer( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - -#if defined(LAST_ACCESSED_X) - // Check if access on width gets out of bounds - // If it does shift access vector to access elements within bounds - const int xi = (int)(get_global_id(0) * VEC_SIZE); - input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x; - output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x; - - // Load data - VEC_DATA_TYPE(int, VEC_SIZE) - val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE)); - - // Create scale and offset vectors - const VEC_DATA_TYPE(float, VEC_SIZE) - vscale = SCALE; - - const VEC_DATA_TYPE(int, VEC_SIZE) - voffset = OFFSET; - - // Dequantize - VEC_DATA_TYPE(float, VEC_SIZE) - res = vscale * CONVERT((val - voffset), VEC_DATA_TYPE(float, VEC_SIZE)); - - // Store result - VSTORE(VEC_SIZE) - (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr); -#else // !defined(LAST_ACCESSED_X) - *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr))) - (int)(OFFSET)) * (float)(SCALE)); -#endif // defined(LAST_ACCESSED_X) -} -#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET) - -#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) -/** This performs per channel dequantization of 8-bit signed integers to floating point. (NCHW) - * - * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. 
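/* The vectorized body above implements, per element, the affine dequantization
   result = SCALE * (value - OFFSET). A scalar C check with assumed parameters
   SCALE=0.5 and OFFSET=3 (example values only): */
#include <stdio.h>

int main(void)
{
    const float scale = 0.5f;
    const int offset = 3;
    const unsigned char q = 11;                /* a QASYMM8 input value */
    const float dq = scale * (float)((int)q - offset);
    printf("%u -> %g\n", (unsigned)q, dq);     /* prints: 11 -> 4 */
    return 0;
}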
-DDATA_TYPE_SRC=char - * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32 - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] scale Pointer to buffer with the per channel quantized scales - */ -__kernel void dequantization_layer_per_channel_nchw( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output), - __global float *scale) -{ - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - -#if defined(LAST_ACCESSED_X) - // Check if access on width gets out of bounds - // If it does shift access vector to access elements within bounds - const int xi = (int)(get_global_id(0) * VEC_SIZE); - input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x; - output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x; - - // Load data - VEC_DATA_TYPE(int, VEC_SIZE) - val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE)); - - // Create scale vectors - const VEC_DATA_TYPE(float, VEC_SIZE) - vscale = scale[get_global_id(2)]; - - // Dequantize - VEC_DATA_TYPE(float, VEC_SIZE) - res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE)); - - // Store result - VSTORE(VEC_SIZE) - (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr); -#else // !defined(LAST_ACCESSED_X) - *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(2)]); -#endif // defined(LAST_ACCESSED_X) -} -/** This performs per channel dequantization of 8-bit signed integers to floating point. (NHWC) - * - * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. 
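/* Per-channel dequantization, by contrast, is symmetric: there is no offset,
   and each channel c applies its own scale[c]. A scalar C sketch with assumed
   scales and input values: */
#include <stdio.h>

int main(void)
{
    const float scale[2] = { 0.25f, 0.5f };    /* one scale per channel */
    const signed char q[2][3] = { { 4, 8, -4 }, { 2, 6, -2 } };
    for (int c = 0; c < 2; ++c)
        for (int i = 0; i < 3; ++i)
            printf("c=%d: %d -> %g\n", c, q[c][i], scale[c] * (float)q[c][i]);
    return 0;
}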
-DDATA_TYPE_SRC=char - * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32 - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] scale Pointer to buffer with the per channel quantized scales - */ -__kernel void dequantization_layer_per_channel_nhwc( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output), - __global float *scale) -{ - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - -#if defined(LAST_ACCESSED_X) - // Check if access on width gets out of bounds - // If it does shift access vector to access elements within bounds - const int xi = (int)(get_global_id(0) * VEC_SIZE); - input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x; - output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x; - scale -= max(xi - (int)LAST_ACCESSED_X, 0); - - // Load data - VEC_DATA_TYPE(int, VEC_SIZE) - val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE)); - - // Create scale vectors - const VEC_DATA_TYPE(float, VEC_SIZE) - vscale = VLOAD(VEC_SIZE)(0, &scale[xi]); - - // Dequantize - VEC_DATA_TYPE(float, VEC_SIZE) - res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE)); - - // Store result - VSTORE(VEC_SIZE) - (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr); -#else // !defined(LAST_ACCESSED_X) - *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(0)]); -#endif // defined(LAST_ACCESSED_X) -} -#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) diff --git a/src/core/CL/cl_kernels/direct_convolution.cl b/src/core/CL/cl_kernels/direct_convolution.cl deleted file mode 100644 index 75a7a0f004..0000000000 --- 
a/src/core/CL/cl_kernels/direct_convolution.cl +++ /dev/null @@ -1,275 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "activation_float_helpers.h" -#include "helpers.h" -#include "helpers_asymm.h" -#include "tile_helpers.h" - -//! @cond Doxygen_Suppress -/** OpenCL kernel to compute the direct convolution. - * - * @note Data layout supported: NHWC - * @note Data type supported: F32/F16/QASYMM8/QASYMM8_SIGNED - * @note The accumulation data type must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=half) - * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) - * @note The convolution strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2) - * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH and -DWEI_HEIGHT (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9) - * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) - * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64) - * @note The channels of the source tensor must be passed at compile time using -DSRC_CHANNELS (e.g. -DSRC_CHANNELS=64) - * @note The channels of the destination tensor must be passed at compile time using -DDST_CHANNELS (e.g. -DDST_CHANNELS=64) - * @note The tensor type ("BUFFER" or "IMAGE") of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER) - * @note The tensor type ("BUFFER" or "IMAGE") of the weights tensor must be passed at compile time using -DWEI_TENSOR_TYPE (e.g. -DWEI_TENSOR_TYPE=BUFFER) - * @note The tensor type ("BUFFER" or "IMAGE") of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER) - * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float) - * @note The data type of the weights tensor must be passed at compile time using -DWEI_DATA_TYPE (e.g. -DWEI_DATA_TYPE=float) - * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g.
-DDST_DATA_TYPE=float) - * @note The data type of the accumulators must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=float) - * @note The number of M0 rows (width*height) to process must be passed at compile time using -DM0 (e.g. -DM0=2) - * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2) - * @note The number of K0 inner accumulations must be passed at compile time using -DK0 (e.g. -DK0=2) - * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1) - * @note The zero value must be passed at compile time using -DZERO_VALUE (e.g. -DZERO_VALUE=0) - * @note Only the following configurations of M0, N0 and K0 are currently supported: - * - M0 = 1, 2, 3, 4, 5, .... n - * - N0 = 2, 3, 4, 8, 16 - * - K0 = 2, 3, 4, 8, 16 (only 4, 8 and 16 if WEI_TENSOR_TYPE=IMAGE) - * - *@note In case of QASYMM8/QASYMM8_SIGNED, the following extra information must be passed at compile time: - * - -DIS_QUANTIZED - * - The destination quantization multiplier e.g. -DDST_MULTIPLIER=1234 - * - The destination quantization shift e.g. -DDST_SHIFT=4 - * - The destination offset e.g. -DDST_OFFSET=4 - * - The source offset e.g. -DSRC_OFFSET=4 - * - The weights offset e.g. -DWEI_OFFSET=4 - * - The quantized zero value e.g. -DZERO_VALUE=4 - * - * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32/QASYMM8 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] wei_ptr Pointer to the weights tensor. 
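/* The quantized path documented above relies on the algebraic identity behind
   the offset-correction steps in the kernel body below (T_OFFSET_CORRECTION
   plus the final T_ADD_CONSTANT):
     sum_k (a_k - oa)(b_k - ob)
       = sum_k a_k*b_k - oa*sum_k b_k - ob*sum_k a_k + K*oa*ob
   A plain-C check of that identity with assumed offsets oa=2, ob=5: */
#include <stdio.h>

int main(void)
{
    const int K = 3, oa = 2, ob = 5;
    const int a[3] = { 10, 20, 30 }, b[3] = { 1, 2, 3 };
    int direct = 0, raw = 0, sum_a = 0, sum_b = 0;
    for (int k = 0; k < K; ++k)
    {
        direct += (a[k] - oa) * (b[k] - ob);   /* offset-subtracted product */
        raw    += a[k] * b[k];                 /* raw product */
        sum_a  += a[k];
        sum_b  += b[k];
    }
    const int corrected = raw - oa * sum_b - ob * sum_a + K * oa * ob;
    printf("%d == %d\n", direct, corrected);   /* both evaluate to -142 */
    return 0;
}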
Supported data type: same as @p src_ptr - * @param[in] wei_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] wei_step_x wei_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] wei_step_y wei_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] wei_step_z wei_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes) - * @param[in] wei_step_w wei_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p src_ptr (if F32/F16) or S32 (if QASYMM8/QASYMM8_SIGNED) - * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes) - * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor - */ -//! @endcond -__kernel void direct_convolution_nhwc( - TENSOR4D(src, SRC_TENSOR_TYPE), - TENSOR4D(dst, DST_TENSOR_TYPE), - TENSOR4D(wei, WEI_TENSOR_TYPE) -#if defined(HAS_BIAS) - , - VECTOR_DECLARATION(bia) -#endif // defined(HAS_BIAS) -) -{ - // All the tensor dimensions are passed at compile time. - // In case of dynamic tensor support, the following dimensions should be passed as function argument. -#define _IWEI_WIDTH WEI_WIDTH -#define _IWEI_HEIGHT WEI_HEIGHT -#define _ISRC_WIDTH SRC_WIDTH -#define _ISRC_HEIGHT SRC_HEIGHT -#define _ISRC_CHANNELS SRC_CHANNELS -#define _IDST_WIDTH DST_WIDTH -#define _IDST_HEIGHT DST_HEIGHT -#define _IDST_CHANNELS DST_CHANNELS -#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT) - - // If quantized, the output tile has to be quantized first before being stored to global memory -#if defined(IS_QUANTIZED) -#define _IOUTPUT_TILE cq -#else // defined(IS_QUANTIZED) -#define _IOUTPUT_TILE c -#endif // defined(IS_QUANTIZED) - - const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM - const int mout = GET_SPATIAL_IDX(1, M0, 0); // WIDTH x HEIGHT - const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX - - // .v = access the whole vector (OpenCL vector) - // .s[x] = access the vector element at position x (scalar access) - TILE(int, M0, 1, xi); - TILE(int, M0, 1, yi); - - // Convert the linear index to coordinate - LOOP_UNROLLING(int, i, 0, 1, M0, - { - xi[i].v = ((mout + i) % _IDST_WIDTH) * STRIDE_X; - yi[i].v = ((mout + i) / _IDST_WIDTH) * STRIDE_Y; - xi[i].v -= PAD_LEFT; - yi[i].v -= PAD_TOP; - }) - - // Initialize the accumulators - TILE(ACC_DATA_TYPE, M0, N0, c); - - LOOP_UNROLLING(int, i, 0, 1, M0, - { - c[i].v = 0; - }) - - for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i) - { - int ck = 0; - int xk = i % _IWEI_WIDTH; - int yk = i / _IWEI_WIDTH; - - int k = 0; - for(; k <= (_ISRC_CHANNELS - K0); k += K0) - { - TILE(SRC_DATA_TYPE, M0, K0, a); - TILE(WEI_DATA_TYPE, N0, K0, b); - - LOOP_UNROLLING(int, i, 0, 1, M0, - { - a[i].v = ZERO_VALUE; - }) - - // Load tile from the src tensor - T_LOAD_NHWC_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, xi, yi, a); - - //
Load tile from the weights tensor - T_LOAD(WEI_DATA_TYPE, N0, K0, WEI_TENSOR_TYPE, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b); - - // Compute the matrix multiplication between two tiles - T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c); - - // Apply the offset correction (correction usually needed for asymmetric quantized computation) - // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero - T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, a, b, c); - - ck += K0; - } - - // We voluntarily use SRC_CHANNELS rather than _DSRC_CHANNELS - // This #if directive should be removed in case of dynamic tensor support -#if((SRC_CHANNELS % K0) != 0) - // Left-over accumulations - for(; k < _ISRC_CHANNELS; ++k) - { - TILE(SRC_DATA_TYPE, M0, 1, a); - TILE(WEI_DATA_TYPE, N0, 1, b); - - LOOP_UNROLLING(int, i, 0, 1, M0, - { - a[i].v = ZERO_VALUE; - }) - - // Load tile from the src tensor - T_LOAD_NHWC_INDIRECT(SRC_DATA_TYPE, M0, 1, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, xi, yi, a); - - // Load tile from the weights tensor - // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration - T_LOAD(WEI_DATA_TYPE, N0, 1, BUFFER, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b); - - // Compute the matrix multiplication between two tiles - T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c); - - // Apply the offset correction (operation usually needed for asymmetric quantized computation) - // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero - T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, SRC_OFFSET, WEI_OFFSET, a, b, c); - - ++ck; - } -#endif // ((SRC_CHANNELS % K0) != 0) - } - - // Offset correction required for the quantized asymmetric computation - // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero - T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * _ISRC_CHANNELS * SRC_OFFSET * WEI_OFFSET), c); - -#if defined(HAS_BIAS) - TILE(BIA_DATA_TYPE, 1, N0, bias0); - - T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 1, 0, bias0); - - // c = c + bias[broadcasted] - T_ADD_BROADCAST_X(ACC_DATA_TYPE, M0, N0, c, bias0, c); - -#endif // HAS_BIAS - - TILE(uint, M0, 1, dst_indirect_y); - - // Calculate the destination indirect Y - LOOP_UNROLLING(int, i, 0, 1, M0, - { - dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1); - dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT); - }) - - bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; - -#if defined(IS_QUANTIZED) - - TILE(DST_DATA_TYPE, M0, N0, cq); - - // Quantize the tile - T_QUANTIZE8_ASYMMETRIC(ACC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq); -#endif // defined(IS_QUANTIZED) - - // Apply activation - T_ACTIVATION(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, _IOUTPUT_TILE, _IOUTPUT_TILE); - - // _IOUTPUT_TILE: c = fp32/fp16, cq=qasymm8 - // Store the tile in reverse order so the invalid values are overwritten with the valid ones - T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, M0, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, _IOUTPUT_TILE, dst_indirect_y); - -#undef _IWEI_WIDTH -#undef _IWEI_HEIGHT -#undef _ISRC_WIDTH -#undef _ISRC_HEIGHT -#undef _ISRC_CHANNELS -#undef _IDST_WIDTH -#undef _IDST_HEIGHT -#undef _IDST_CHANNELS -#undef _IY_MULTIPLIER -} \ No 
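/* The channel loop of direct_convolution_nhwc above processes K0 channels per
   iteration and falls back to a scalar loop for the remainder when
   SRC_CHANNELS is not a multiple of K0. The same blocking pattern in plain C,
   with illustrative sizes (10 channels, K0=4): */
#include <stdio.h>

int main(void)
{
    enum { CHANNELS = 10, K0 = 4 };
    float a[CHANNELS], b[CHANNELS], acc = 0.0f;
    for (int i = 0; i < CHANNELS; ++i) { a[i] = 1.0f; b[i] = 2.0f; }

    int k = 0;
    for (; k <= CHANNELS - K0; k += K0)    /* blocked accumulation */
        for (int j = 0; j < K0; ++j)
            acc += a[k + j] * b[k + j];
    for (; k < CHANNELS; ++k)              /* left-over accumulation */
        acc += a[k] * b[k];

    printf("acc = %g\n", acc);             /* 10 * (1*2) = 20 */
    return 0;
}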
newline at end of file diff --git a/src/core/CL/cl_kernels/direct_convolution1x1.cl b/src/core/CL/cl_kernels/direct_convolution1x1.cl deleted file mode 100644 index 8ab2d1d4ea..0000000000 --- a/src/core/CL/cl_kernels/direct_convolution1x1.cl +++ /dev/null @@ -1,316 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#undef CONVERT_SAT - -#define ADD_OP(a, b) ((a) + (b)) -#define MUL_OP(a, b) ((a) * (b)) -#define CONVERT_SAT(a, b) ((a)) - -#if defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) - -#if STRIDE_X == 3 -#define INPUT_PIXEL_STR(data_size) extract_input_stride3_##data_size -#define INPUT_PIXEL(data_size) INPUT_PIXEL_STR(data_size) -#elif STRIDE_X == 2 -#define INPUT_PIXEL(data_size) extract_input_stride2 -#elif STRIDE_X == 1 -#define INPUT_PIXEL(data_size) extract_input_stride1 -#else /* STRIDE_X not equals 1, 2 or 3 */ -#error "Only support strides 1, 2 and 3" -#endif /* STRIDE_X == 3 */ - -/** Extracts a 1D horizontal vector from the input tensor with stride as 1. - * - * @param[in] input_pixel Pointer to the first pixel. - * - * @return extracted input values. - */ -inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_pixel) -{ - return vload8(0, input_pixel); -} - -/** Extracts a 1D horizontal vector from the input tensor with stride as 2. - * - * @param[in] input_pixel Pointer to the first pixel. - * - * @return extracted input values. - */ -inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_pixel) -{ - VEC_DATA_TYPE(DATA_TYPE, 16) - temp = vload16(0, input_pixel); - return temp.s02468ace; -} - -/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 32-bit data size. - * - * @param[in] input_pixel Pointer to the first pixel. - * - * @return extracted input values. 
- */ -inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_32(__global const DATA_TYPE *input_pixel) -{ - VEC_DATA_TYPE(DATA_TYPE, 4) - temp1 = vload4(0, input_pixel); - VEC_DATA_TYPE(DATA_TYPE, 4) - temp2 = vload4(0, input_pixel + 6); - VEC_DATA_TYPE(DATA_TYPE, 4) - temp3 = vload4(0, input_pixel + 12); - VEC_DATA_TYPE(DATA_TYPE, 4) - temp4 = vload4(0, input_pixel + 18); - return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s03, temp2.s03, temp3.s03, temp4.s03); -} - -/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 16-bit data size. - * - * @param[in] input_pixel Pointer to the first pixel. - * - * @return extracted input values. - */ -inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_16(__global const DATA_TYPE *input_pixel) -{ - VEC_DATA_TYPE(DATA_TYPE, 8) - temp1 = vload8(0, input_pixel); - VEC_DATA_TYPE(DATA_TYPE, 8) - temp2 = vload8(0, input_pixel + 8); - VEC_DATA_TYPE(DATA_TYPE, 8) - temp3 = vload8(0, input_pixel + 16); - return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s036, temp2.s147, temp3.s25); -} - -/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 8-bit data size. - * - * @param[in] input_pixel Pointer to the first pixel. - * - * @return extracted input values. - */ -inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_8(__global const DATA_TYPE *input_pixel) -{ - VEC_DATA_TYPE(DATA_TYPE, 16) - temp1 = vload16(0, input_pixel); - VEC_DATA_TYPE(DATA_TYPE, 16) - temp2 = vload16(0, input_pixel + 12); - return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s0369, temp2.s0369); -} - -/** This kernel performs a direct convolution to convolve the low three dimensions. - * - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32 - * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1 - * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH - * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. 
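/* The extract_input_stride* helpers above turn a strided read into wide vector
   loads plus lane shuffles; for example, stride 2 loads 16 consecutive values
   and keeps the even lanes (.s02468ace). An equivalent scalar C sketch,
   assuming float data: */
#include <stdio.h>

int main(void)
{
    float in[16], out[8];
    for (int i = 0; i < 16; ++i) in[i] = (float)i;
    for (int i = 0; i < 8; ++i)  out[i] = in[2 * i];   /* even lanes only */
    for (int i = 0; i < 8; ++i)  printf("%g ", out[i]);
    printf("\n");                                      /* 0 2 4 ... 14 */
    return 0;
}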
Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr Pointer to the biases tensor. Same as @p src_ptr - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - */ -__kernel void direct_convolution1x1( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(weights), -#ifdef HAS_BIAS - VECTOR_DECLARATION(biases), -#endif /* defined(HAS_BIAS) */ - unsigned int weights_stride_w) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - -#ifdef HAS_BIAS - Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); -#endif /* defined(HAS_BIAS) */ - - VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8) - values = 0; - - const uint z_index = get_global_id(2); - - weights.ptr += z_index * weights_stride_w; - for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d) - { - DATA_TYPE weight = *(__global DATA_TYPE *)weights.ptr; - VEC_DATA_TYPE(DATA_TYPE, 8) - input_pixel = INPUT_PIXEL(DATA_SIZE)((__global DATA_TYPE *)src.ptr); - values = ADD_OP(values, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))weight, input_pixel)); - src.ptr += src_stride_z; - weights.ptr += weights_stride_z; - } - -#ifdef HAS_BIAS - values = ADD_OP(values, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, z_index)))); -#endif /* defined(HAS_BIAS) */ - - vstore8(CONVERT_SAT(values, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr); -} -#endif // defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) - -#if defined(WEIGHTS_DEPTH) - -#define CONVOLUTION1x1_BIFROST(acc, src, weight_value) \ - ({ \ - acc.s0 = mad(src.s0, weight_value, acc.s0); \ - acc.s1 = mad(src.s1, weight_value, 
acc.s1); \ - acc.s2 = mad(src.s2, weight_value, acc.s2); \ - acc.s3 = mad(src.s3, weight_value, acc.s3); \ - }) - -/** An optimized direct convolution 1x1 OpenCL kernel for Bifrost architectures when the data type is F32 - * - * @note This OpenCL kernel works only with stride_x and stride_y equal to 1 - * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH - * @note If biases are used, -DHAS_BIAS must be passed at compile time - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr Pointer to the biases tensor.
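/* Each CONVOLUTION1x1_BIFROST expansion above is four multiply-accumulates,
   one per output lane, expressed with OpenCL's mad(). A scalar C analogue
   using fmaf (illustrative values; mad need not be fused, so this is an
   approximation of the lane-wise pattern, not a bit-exact equivalent): */
#include <math.h>
#include <stdio.h>

int main(void)
{
    float acc[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
    const float src[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
    const float weight = 0.5f;
    for (int i = 0; i < 4; ++i)
        acc[i] = fmaf(src[i], weight, acc[i]);  /* acc.sN = mad(src.sN, w, acc.sN) */
    printf("%g %g %g %g\n", acc[0], acc[1], acc[2], acc[3]);
    return 0;
}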
Same as @p src_ptr - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - */ -__kernel void direct_convolution1x1_f32_bifrost( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(weights), -#ifdef HAS_BIAS - VECTOR_DECLARATION(biases), -#endif /* defined(HAS_BIAS) */ - unsigned int weights_stride_w) -{ - // Get the kernel index - const int kernel_index = get_global_id(2); - - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - - float4 acc0 = 0.0f; - float4 acc1 = 0.0f; - float4 acc2 = 0.0f; - float4 acc3 = 0.0f; - - __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w); - __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0); - - for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d) - { - // Load the weights - float weight = *((__global float *)weights_addr); - - // Load values from row0 of input tensor - float4 src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); - float4 src1 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); - float4 src2 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); - float4 src3 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); - - CONVOLUTION1x1_BIFROST(acc0, src0, weight); - CONVOLUTION1x1_BIFROST(acc1, src1, weight); - CONVOLUTION1x1_BIFROST(acc2, src2, weight); - CONVOLUTION1x1_BIFROST(acc3, src3, weight); - - src_addr += src_stride_z; - weights_addr += weights_stride_z; - } - -#ifdef HAS_BIAS - Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); - - float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index))); - - acc0.s0 += bias; - acc0.s1 += bias; - acc0.s2 += bias; - acc0.s3 += bias; - acc1.s0 += bias; - acc1.s1 += bias; - acc1.s2 += bias; - acc1.s3 += bias; - acc2.s0 += bias; - acc2.s1 += bias; - acc2.s2 += bias; - acc2.s3 += bias; - acc3.s0 += bias; - acc3.s1 += bias; - acc3.s2 += bias; - acc3.s3 += bias; -#endif /* defined(HAS_BIAS) */ - - vstore4(acc0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y)); - vstore4(acc1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y)); - vstore4(acc2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y)); - vstore4(acc3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y)); -} -#endif // defined(WEIGHTS_DEPTH) diff --git a/src/core/CL/cl_kernels/direct_convolution3x3.cl b/src/core/CL/cl_kernels/direct_convolution3x3.cl deleted file mode 100644 index 811df053c4..0000000000 --- a/src/core/CL/cl_kernels/direct_convolution3x3.cl +++ /dev/null @@ -1,291 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#undef CONVERT_SAT - -#define ADD_OP(a, b) ((a) + (b)) -#define MUL_OP(a, b) ((a) * (b)) -#define CONVERT_SAT(a, b) ((a)) - -#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) - -#if STRIDE_X == 1 -#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) -#elif STRIDE_X == 2 /* STRIDE_X == 1 */ -#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) -#else /* STRIDE_X not equals 1 or 2 */ -#error "STRIDE_X larger than 2 is not supported" -#endif /* STRIDE_X == 2 */ - -#define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - VEC_DATA_TYPE(DATA_TYPE, 3) \ - weights_values0 = vload3(0, weights_row_ptr); \ - VEC_DATA_TYPE(DATA_TYPE, 8) \ - src0 = vload8(0, src_row_ptr); \ - VEC_DATA_TYPE(DATA_TYPE, 2) \ - src1 = vload2(0, src_row_ptr + 8); \ - \ - acc = ADD_OP(acc, MUL_OP(src0, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \ - acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \ - acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \ - }) - -#define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - VEC_DATA_TYPE(DATA_TYPE, 3) \ - weights_values0 = vload3(0, weights_row_ptr); \ - VEC_DATA_TYPE(DATA_TYPE, 16) \ - src0 = vload16(0, src_row_ptr); \ - DATA_TYPE src1 = *(src_row_ptr + 16); \ - \ - acc = ADD_OP(acc, MUL_OP(src0.even, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \ - acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \ - acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \ - }) - -/** This kernel performs a direct convolution to convolve the low three dimensions. - * - * @note This OpenCL kernel works with stride_x = 1 and 2 - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. 
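/* CONVOLUTION1x3_STRIDE1 below computes eight outputs of a three-tap row
   convolution from ten consecutive inputs (a vload8 plus a vload2), sliding
   the window by one element per weight. The same computation in plain C: */
#include <stdio.h>

int main(void)
{
    float src[10], w[3] = { 1.0f, 2.0f, 3.0f }, acc[8] = { 0 };
    for (int i = 0; i < 10; ++i) src[i] = (float)i;
    for (int x = 0; x < 8; ++x)        /* eight output lanes */
        for (int t = 0; t < 3; ++t)    /* three taps */
            acc[x] += src[x + t] * w[t];
    for (int x = 0; x < 8; ++x) printf("%g ", acc[x]);
    printf("\n");
    return 0;
}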
-DDATA_TYPE=float - * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH - * @note If biases are used then -DHAS_BIAS has to be passed at compile time - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr Pointer to the biases tensor. 
Same as @p src_ptr - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - */ -__kernel void direct_convolution3x3( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(weights), -#ifdef HAS_BIAS - VECTOR_DECLARATION(biases), -#endif /* defined(HAS_BIAS) */ - unsigned int weights_stride_w) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - - VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8) - values0 = 0; - - __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0); - __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0); - - const int kernel_index = get_global_id(2); - weights_addr += kernel_index * weights_stride_w; - - for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d) - { - CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y)); - CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y)); - CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y)); - - src_addr += src_stride_z; - weights_addr += weights_stride_z; - } - -#ifdef HAS_BIAS - Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); - - values0 = ADD_OP(values0, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index)))); -#endif /* defined(HAS_BIAS) */ - - vstore8(CONVERT_SAT(values0, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr); -} -#endif //defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) - -#if defined(WEIGHTS_DEPTH) - -#define CONVOLUTION1x3_BIFROST(acc, src0, src1, weights_row0) \ - ({ \ - acc.s0 = mad(src0.s0, weights_row0.s0, acc.s0); \ - acc.s1 = mad(src0.s1, weights_row0.s0, acc.s1); \ - acc.s2 = mad(src0.s2, weights_row0.s0, acc.s2); \ - acc.s3 = mad(src0.s3, weights_row0.s0, acc.s3); \ - acc.s0 = mad(src0.s1, weights_row0.s1, acc.s0); \ - acc.s1 = mad(src0.s2, weights_row0.s1, acc.s1); \ - acc.s2 = mad(src0.s3, weights_row0.s1, acc.s2); \ - acc.s3 = mad(src1.s0, weights_row0.s1, acc.s3); \ - acc.s0 = mad(src0.s2, weights_row0.s2, acc.s0); \ - acc.s1 = mad(src0.s3, weights_row0.s2, acc.s1); \ - acc.s2 = mad(src1.s0, weights_row0.s2, acc.s2); \ - acc.s3 = mad(src1.s1, weights_row0.s2, acc.s3); \ - }) - -/** An optimized direct convolution 3x3 OpenCL kernel for Bifrost architectures when the data type is F32 - * - * @note This OpenCL kernel works only with stride_x and stride_y equal to 1 - * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH - * @note If biases are used, -DHAS_BIAS must be passed at compile time - * - * @param[in] src_ptr Pointer to the source tensor.
Supported data types: F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along y processed per workitem(in bytes) - * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] biases_ptr Pointer to the biases tensor. 
Same as @p src_ptr - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - */ -__kernel void direct_convolution3x3_f32_bifrost( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(weights), -#ifdef HAS_BIAS - VECTOR_DECLARATION(biases), -#endif /* defined(HAS_BIAS) */ - unsigned int weights_stride_w) -{ - // Get the kernel index - const int kernel_index = get_global_id(2); - - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - - float4 values0 = 0; - float4 values1 = 0; - float4 values2 = 0; - - __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w); - __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0); - - // Note: Since each work-item computes 4x3 elements, we need to load 5 rows from the input tensor - - for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d) - { - // Load the weights - float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y)); - float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y)); - float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y)); - float4 src0; - float2 src1; - - // Load values from row0 of input tensor - src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); - src1 = vload2(0, (__global float *)(src_addr + 0 * src_stride_y) + 4); - - CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row0); - - // Load values from row1 of input tensor - src0 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); - src1 = vload2(0, (__global float *)(src_addr + 1 * src_stride_y) + 4); - - // Accumulate - CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row1); - CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row0); - - // Load values from row2 of input tensor - src0 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); - src1 = vload2(0, (__global float *)(src_addr + 2 * src_stride_y) + 4); - - // Accumulate - CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row2); - CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row1); - CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row0); - - // Load values from row3 of input tensor - src0 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); - src1 = vload2(0, (__global float *)(src_addr + 3 * src_stride_y) + 4); - - // Accumulate - CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row2); - CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row1); - - // Row4 - src0 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); - src1 = vload2(0, (__global float *)(src_addr + 4 * src_stride_y) + 4); - - // Accumulate - CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row2); - - src_addr += src_stride_z; - weights_addr += weights_stride_z; - } - -#ifdef HAS_BIAS - Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); - - float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index))); - - values0 += (float4)bias; - values1 += (float4)bias; - values2 += (float4)bias; -#endif /* defined(HAS_BIAS) */ - - vstore4(values0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y)); - 
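// The two vstore4 calls below store rows 1 and 2 of the 4x3 output tile computed by this work-item (row 0 was stored by the vstore4 above).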
vstore4(values1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y)); - vstore4(values2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y)); -} -#endif // defined(WEIGHTS_DEPTH) diff --git a/src/core/CL/cl_kernels/direct_convolution5x5.cl b/src/core/CL/cl_kernels/direct_convolution5x5.cl deleted file mode 100644 index 59d668f0bf..0000000000 --- a/src/core/CL/cl_kernels/direct_convolution5x5.cl +++ /dev/null @@ -1,313 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#undef CONVERT_SAT - -#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) - -#if STRIDE_X == 1 -#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) -#elif STRIDE_X == 2 /* STRIDE_X == 1 */ -#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) -#else /* STRIDE_X not equals 1 or 2 */ -#error "STRIDE_X larger than 2 is not supported" -#endif /* STRIDE_X == 2 */ - -#define CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - VEC_DATA_TYPE(DATA_TYPE, 4) \ - weights_values0 = vload4(0, weights_row_ptr); \ - DATA_TYPE weights_value1 = *(weights_row_ptr + 4); \ - VEC_DATA_TYPE(DATA_TYPE, 8) \ - src0 = vload8(0, src_row_ptr); \ - VEC_DATA_TYPE(DATA_TYPE, 4) \ - src1 = vload4(0, src_row_ptr + 8); \ - \ - acc += src0 * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \ - acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \ - acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \ - acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s345, src0.s67, src1.s012) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \ - acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s45, src0.s67, src1.s0123) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \ - }) - -#define CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - VEC_DATA_TYPE(DATA_TYPE, 4) \ - weights_values0 = vload4(0, weights_row_ptr); \ - DATA_TYPE weights_value1 = *(weights_row_ptr + 4); \ - VEC_DATA_TYPE(DATA_TYPE, 16) \ - src0 = vload16(0, src_row_ptr); \ - VEC_DATA_TYPE(DATA_TYPE, 4) \ - src1 = vload4(0, src_row_ptr + 16); \ - acc += src0.even * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \ - acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF) * 
(VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \
-        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \
-        \
-        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s3579, src0.sBDF, src1.s1) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \
-        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s468a, src0.sCE, src1.s02) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \
-    })
-
-/** This kernel performs a direct convolution to convolve the low three dimensions.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
- * @note If biases are used then -DHAS_BIAS has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor.
Same as @p src_ptr - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - */ -__kernel void direct_convolution5x5( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(weights), -#ifdef HAS_BIAS - VECTOR_DECLARATION(biases), -#endif /* defined(HAS_BIAS) */ - unsigned int weights_stride_w) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - - VEC_DATA_TYPE(DATA_TYPE, 8) - values0 = 0; - - __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0); - __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0); - - const int kernel_index = get_global_id(2); - weights_addr += kernel_index * weights_stride_w; - - for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d) - { - CONVOLUTION1x5(values0, (__global DATA_TYPE *)src_addr, (__global DATA_TYPE *)weights_addr); - CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y)); - CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y)); - CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y)); - CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y)); - - src_addr += src_stride_z; - weights_addr += weights_stride_z; - } - -#ifdef HAS_BIAS - Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); - - values0 += (VEC_DATA_TYPE(DATA_TYPE, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index))); -#endif /* defined(HAS_BIAS) */ - - vstore8(values0, 0, (__global DATA_TYPE *)dst.ptr); -} -#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) - -#if defined(WEIGHTS_DEPTH) - -#define CONVOLUTION1x5_BIFROST(acc, src0, weights_row00, weights_row01) \ - ({ \ - acc.s0 = mad(src0.s0, weights_row00.s0, acc.s0); \ - acc.s1 = mad(src0.s1, weights_row00.s0, acc.s1); \ - acc.s2 = mad(src0.s2, weights_row00.s0, acc.s2); \ - acc.s3 = mad(src0.s3, weights_row00.s0, acc.s3); \ - acc.s0 = mad(src0.s1, weights_row00.s1, acc.s0); \ - acc.s1 = mad(src0.s2, weights_row00.s1, acc.s1); \ - acc.s2 = mad(src0.s3, weights_row00.s1, acc.s2); \ - acc.s3 = mad(src0.s4, weights_row00.s1, acc.s3); \ - acc.s0 = mad(src0.s2, weights_row00.s2, acc.s0); \ - acc.s1 = mad(src0.s3, weights_row00.s2, acc.s1); \ - acc.s2 = mad(src0.s4, weights_row00.s2, acc.s2); \ - acc.s3 = mad(src0.s5, weights_row00.s2, acc.s3); \ - acc.s0 = mad(src0.s3, weights_row00.s3, acc.s0); \ - acc.s1 = mad(src0.s4, weights_row00.s3, acc.s1); \ - acc.s2 = mad(src0.s5, weights_row00.s3, acc.s2); \ - acc.s3 = mad(src0.s6, weights_row00.s3, acc.s3); \ - acc.s0 = mad(src0.s4, weights_row01, acc.s0); \ - acc.s1 = mad(src0.s5, weights_row01, acc.s1); \ - acc.s2 = mad(src0.s6, weights_row01, acc.s2); \ - acc.s3 = mad(src0.s7, weights_row01, acc.s3); \ - }) - -/** An optimized direct convolution 5x5 OpenCL kernel for Bifrost architectures when the data type is F32 - * - * @note This OpenCL 
kernel works only with stride_x and stride_y equal to 1
- * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
- * @note If biases are used then -DHAS_BIAS has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor.
Same as @p src_ptr - * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension - */ -__kernel void direct_convolution5x5_f32_bifrost( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(weights), -#ifdef HAS_BIAS - VECTOR_DECLARATION(biases), -#endif /* defined(HAS_BIAS) */ - unsigned int weights_stride_w) -{ - // Get the kernel index - const int kernel_index = get_global_id(2); - - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - - float4 values0 = 0.0f; - float4 values1 = 0.0f; - - __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w); - __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0); - - // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor - - for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d) - { - // Load the weights from row0 and row1 - float4 weights_row00 = vload4(0, (__global float *)(weights_addr + 0 * weights_stride_y)); - float weights_row01 = *((__global float *)(weights_addr + 0 * weights_stride_y) + 4); - float4 weights_row10 = vload4(0, (__global float *)(weights_addr + 1 * weights_stride_y)); - float weights_row11 = *((__global float *)(weights_addr + 1 * weights_stride_y) + 4); - float8 src0; - - // Load values from row0 of input tensor - src0 = vload8(0, (__global float *)(src_addr + 0 * src_stride_y)); - - // Accumulate - CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01); - - // Load values from row1 of input tensor - src0 = vload8(0, (__global float *)(src_addr + 1 * src_stride_y)); - - // Accumulate - CONVOLUTION1x5_BIFROST(values0, src0, weights_row10, weights_row11); - CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01); - - // Load values from row2 of input tensor - src0 = vload8(0, (__global float *)(src_addr + 2 * src_stride_y)); - - // Load weights from row2 - weights_row00 = vload4(0, (__global float *)(weights_addr + 2 * weights_stride_y)); - weights_row01 = *((__global float *)(weights_addr + 2 * weights_stride_y) + 4); - - // Accumulate - CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01); - CONVOLUTION1x5_BIFROST(values1, src0, weights_row10, weights_row11); - - // Load values from row3 of input tensor - src0 = vload8(0, (__global float *)(src_addr + 3 * src_stride_y)); - - // Load weights from row3 - weights_row10 = vload4(0, (__global float *)(weights_addr + 3 * weights_stride_y)); - weights_row11 = *((__global float *)(weights_addr + 3 * weights_stride_y) + 4); - - // Accumulate - CONVOLUTION1x5_BIFROST(values0, src0, weights_row10, weights_row11); - CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01); - - // Load values from row4 of input tensor - src0 = vload8(0, (__global float *)(src_addr + 4 * src_stride_y)); - - // Load weights from row4 - weights_row00 = vload4(0, (__global float *)(weights_addr + 4 * weights_stride_y)); - weights_row01 = *((__global float *)(weights_addr + 4 * weights_stride_y) + 4); - - CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01); - CONVOLUTION1x5_BIFROST(values1, src0, weights_row10, weights_row11); - - // Load 
values from row5 of input tensor - src0 = vload8(0, (__global float *)(src_addr + 5 * src_stride_y)); - - // Accumulate - CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01); - - src_addr += src_stride_z; - weights_addr += weights_stride_z; - } - -#ifdef HAS_BIAS - Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); - - float4 bias = (float4) * ((__global float *)(vector_offset(&biases, kernel_index))); - - values0 += bias; - values1 += bias; -#endif /* defined(HAS_BIAS) */ - - vstore4(values0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y)); - vstore4(values1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y)); -} -#endif // defined(WEIGHTS_DEPTH) diff --git a/src/core/CL/cl_kernels/direct_convolution_quantized.cl b/src/core/CL/cl_kernels/direct_convolution_quantized.cl deleted file mode 100644 index b80d4f587e..0000000000 --- a/src/core/CL/cl_kernels/direct_convolution_quantized.cl +++ /dev/null @@ -1,308 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
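As an aside on the direct_convolution5x5_f32_bifrost kernel whose hunk ends above: each work-item produces a 4x2 output tile, so six input rows are loaded per channel and every input row is consumed by each output row whose five-row window covers it, the weight rows being held in two alternating register pairs. A minimal scalar sketch of that schedule, assuming a hypothetical conv1x5() helper (not library code):

    /* Hypothetical scalar helper: accumulate one horizontal 1x5 dot product
     * into four adjacent output columns. */
    static void conv1x5(float *out4, const float *in_row, const float *w_row)
    {
        for (int x = 0; x < 4; ++x)      /* 4 output columns per row */
            for (int k = 0; k < 5; ++k)
                out4[x] += in_row[x + k] * w_row[k];
    }

    /* Row schedule of the 4x2 tile: input row r feeds every output row y
     * with y <= r <= y + 4, so six rows cover both output rows. */
    static void conv5x5_tile(float out[2][4], const float in[6][8], const float w[5][5])
    {
        for (int y = 0; y < 2; ++y)
            for (int ky = 0; ky < 5; ++ky)
                conv1x5(out[y], in[y + ky], w[ky]);
    }

The real kernel interleaves these iterations so that each of the six rows is loaded exactly once per input channel and consumed immediately.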
- */ -#include "helpers_asymm.h" - -#undef CONVERT_SAT_STR -#undef CONVERT_SAT - -#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) - -#define CONVERT_SAT_STR(x, type) (convert_##type##8_sat((x))) -#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) - -#if KERNEL_SIZE == 9 - -#if STRIDE_X == 1 -#define CONVOLUTION1x9(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x9_STRIDE1(acc, src_row_ptr, weights_row_ptr) -#elif STRIDE_X == 2 -#define CONVOLUTION1x9(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x9_STRIDE2(acc, src_row_ptr, weights_row_ptr) -#else /* STRIDE_X not equals 1 or 2 */ -#error "STRIDE_X larger than 2 is not supported" -#endif /* STRIDE_X */ - -#define CONVOLUTION1x9_STRIDE1(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - int8 weights_values0 = convert_int8(vload8(0, weights_row_ptr)); \ - int weights_value1 = convert_int(*(weights_row_ptr + 8)); \ - int16 src0 = convert_int16(vload16(0, src_row_ptr)); \ - acc += (src0.lo + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s1234, src0.s5678) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s2345, src0.s6789) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s3456, src0.s789A) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s4567, src0.s89AB) + INPUT_OFFSET) * ((int8)weights_values0.s4 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s5678, src0.s9ABC) + INPUT_OFFSET) * ((int8)weights_values0.s5 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s6789, src0.sABCD) + INPUT_OFFSET) * ((int8)weights_values0.s6 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s789A, src0.sBCDE) + INPUT_OFFSET) * ((int8)weights_values0.s7 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s89AB, src0.sCDEF) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \ - }) - -#define CONVOLUTION1x9_STRIDE2(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - int8 weights_values0 = convert_int8(vload8(0, weights_row_ptr)); \ - int weights_value1 = convert_int(*(weights_row_ptr + 8)); \ - int16 src0 = convert_int16(vload16(0, src_row_ptr)); \ - int8 src1 = convert_int8(vload8(0, src_row_ptr + 16)); \ - acc += (src0.even + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s1357, src0.s9BDF) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s2468, src0.sACE, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s3579, src0.sBDF, src1.s1) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s468A, src0.sCE, src1.s02) + INPUT_OFFSET) * ((int8)weights_values0.s4 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s579B, src0.sDF, src1.s13) + INPUT_OFFSET) * ((int8)weights_values0.s5 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s68AC, src0.sE, src1.s024) + INPUT_OFFSET) * ((int8)weights_values0.s6 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s79BD, src0.sF, src1.s135) + INPUT_OFFSET) * ((int8)weights_values0.s7 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s8ACE, src1.s0246) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \ - }) - -#elif KERNEL_SIZE == 5 - -#if STRIDE_X == 1 -#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) -#elif STRIDE_X == 2 -#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) -#else 
/* STRIDE_X not equals 1 or 2 */ -#error "STRIDE_X larger than 2 is not supported" -#endif /* STRIDE_X */ - -#define CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - int4 weights_values0 = convert_int4(vload4(0, weights_row_ptr)); \ - int weights_value1 = convert_int(*(weights_row_ptr + 4)); \ - int8 src0 = convert_int8(vload8(0, src_row_ptr)); \ - int4 src1 = convert_int4(vload4(0, src_row_ptr + 8)); \ - acc += (src0 + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s1234, src0.s567, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s234, src0.s567, src1.s01) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s345, src0.s67, src1.s012) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s45, src0.s67, src1.s0123) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \ - }) - -#define CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - int4 weights_values0 = convert_int4(vload4(0, weights_row_ptr)); \ - int weights_value1 = convert_int(*(weights_row_ptr + 4)); \ - int16 src0 = convert_int16(vload16(0, src_row_ptr)); \ - int4 src1 = convert_int4(vload4(0, src_row_ptr + 16)); \ - acc += (src0.even + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s1357, src0.s9BDF) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s2468, src0.sACE, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s3579, src0.sBDF, src1.s1) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s468a, src0.sCE, src1.s02) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \ - }) - -#elif KERNEL_SIZE == 3 - -#if STRIDE_X == 1 -#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) -#elif STRIDE_X == 2 -#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) -#else /* STRIDE_X not equals 1 or 2 */ -#error "STRIDE_X larger than 2 is not supported" -#endif /* STRIDE_X */ - -#define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - int3 weights_values0 = convert_int3(vload3(0, weights_row_ptr)); \ - int8 src0 = convert_int8(vload8(0, src_row_ptr)); \ - int2 src1 = convert_int2(vload2(0, src_row_ptr + 8)); \ - acc += (src0 + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s1234, src0.s567, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s234, src0.s567, src1.s01) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ - }) - -#define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) \ - ({ \ - int3 weights_values0 = convert_int3(vload3(0, weights_row_ptr)); \ - int16 src0 = convert_int16(vload16(0, src_row_ptr)); \ - int src1 = convert_int(*(src_row_ptr + 16)); \ - acc += (src0.even + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s1357, src0.s9BDF) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ - acc += ((int8)(src0.s2468, src0.sACE, src1) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ - }) - -#elif KERNEL_SIZE == 1 - -#if STRIDE_X == 3 -#define INPUT_VALUE extract_input_stride3 -#elif STRIDE_X == 2 -#define INPUT_VALUE extract_input_stride2 -#elif STRIDE_X == 1 -#define 
INPUT_VALUE extract_input_stride1 - -#else /* STRIDE_X not equals 1, 2 or 3 */ -#error "Only support strides 1, 2 and 3" -#endif /* STRIDE_X */ - -/** Extracts a 1D horizontal vector from the input tensor with stride as 1. - * - * @param[in] input_value Pointer to the first value. - * - * @return extracted input values. - */ -inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_value) -{ - return vload8(0, input_value); -} - -/** Extracts a 1D horizontal vector from the input tensor with stride as 2. - * - * @param[in] input_value Pointer to the first value. - * - * @return extracted input values. - */ -inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_value) -{ - VEC_DATA_TYPE(DATA_TYPE, 16) - temp = vload16(0, input_value); - return temp.s02468ace; -} - -/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 8-bit data size. - * - * @param[in] input_value Pointer to the first value. - * - * @return extracted input values. - */ -inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3(__global const DATA_TYPE *input_value) -{ - VEC_DATA_TYPE(DATA_TYPE, 16) - temp1 = vload16(0, input_value); - VEC_DATA_TYPE(DATA_TYPE, 16) - temp2 = vload16(0, input_value + 12); - return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s0369, temp2.s0369); -} - -#else /* KERNEL_SIZE not equals 1, 3 , 5, 9 */ -#error "Only kernel sizes 1, 3, 5 and 9 are supported" -#endif /* KERNEL_SIZE */ - -/** This kernel performs a direct convolution to convolve the low three dimensions. - * - * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1 - * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH - * @note If biases are used then -DHAS_BIAS has to be passed at compile time - * @note The output quantization multiplier must be passed at compile time using -DOUTPUT_MULTIPLIER e.g. -DOUTPUT_MULTIPLIER=1234 - * @note The output quantization shift must be passed at compile time using -DOUTPUT_SHIFT e.g. -DOUTPUT_SHIFT=4 - * @note The input offset quantization parameter must be passed at compile time using -DINPUT_OFFSET e.g. -DINPUT_OFFSET=3 - * @note The weights offset quantization parameter must be passed at compile time using -DWEIGHTS_OFFSET e.g. -DWEIGHTS_OFFSET=3 - * @note The destination offset quantization parameter must be passed at compile time using -DOUTPUT_OFFSET e.g. -DOUTPUT_OFFSET=3 - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
- * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: S32
- * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
- * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
- * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
- */
-__kernel void direct_convolution_quantized(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst),
-    TENSOR3D_DECLARATION(weights),
-#ifdef HAS_BIAS
-    VECTOR_DECLARATION(biases),
-#endif /* defined(HAS_BIAS) */
-    unsigned int weights_stride_w)
-{
-    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
-    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
-    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
-    int8 values0 = 0;
-
-    __global DATA_TYPE *weights_addr = (__global DATA_TYPE *)tensor3D_offset(&weights, 0, 0, 0);
-    __global DATA_TYPE *src_addr     = (__global DATA_TYPE *)offset(&src, 0, 0);
-
-    const int kernel_index = get_global_id(2);
-    weights_addr += kernel_index * weights_stride_w;
-
-    for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
-    {
-#if KERNEL_SIZE == 9
-        CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y));
-        CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
-        CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
-        CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y));
-        CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y));
-        CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 5 * weights_stride_y));
-        CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 6 * weights_stride_y));
-        CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 7 * weights_stride_y));
-        CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 8 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 8 * weights_stride_y));
-#elif KERNEL_SIZE == 5
-        CONVOLUTION1x5(values0, (__global DATA_TYPE *)src_addr, (__global DATA_TYPE *)weights_addr);
-        CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
-        CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
-        CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y));
-        CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y));
-#elif KERNEL_SIZE == 3
-        CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y));
-        CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
-        CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
-#elif KERNEL_SIZE == 1
-        int  weight      = convert_int(*(__global DATA_TYPE *)weights_addr);
-        int8 input_value = convert_int8(INPUT_VALUE((__global DATA_TYPE *)src_addr));
-        values0 += (input_value + INPUT_OFFSET) * ((int8)weight + WEIGHTS_OFFSET);
-#endif /* (KERNEL_SIZE == 1) || (KERNEL_SIZE == 3) || (KERNEL_SIZE == 5) || (KERNEL_SIZE == 9) */
-
-        src_addr += src_stride_z;
-        weights_addr += weights_stride_z;
-    }
-
-#ifdef HAS_BIAS
-    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
-    __global int *bias_addr = ((__global int *)(vector_offset(&biases, kernel_index)));
-    values0 += (int8)(*bias_addr);
-#endif /* defined(HAS_BIAS) */
-
-#if OUTPUT_SHIFT < 0
-    values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
-#else // OUTPUT_SHIFT < 0
-    values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
-#endif // OUTPUT_SHIFT < 0
-    values0 = values0 + OUTPUT_OFFSET;
-
-    vstore8(CONVERT_SAT(values0, DATA_TYPE), 0, (__global DATA_TYPE *)dst.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
diff --git a/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl b/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl
deleted file mode 100644
index 1f940001f3..0000000000
--- a/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl
+++ /dev/null
@@ -1,219 +0,0 @@
-/*
- * Copyright (c) 2021 Arm Limited.
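For reference, the direct_convolution_quantized kernel deleted above integerizes the convolution as values0 += (src + INPUT_OFFSET) * (weight + WEIGHTS_OFFSET) and converts back to 8 bits only in the final requantization. A minimal C sketch of that last step, assuming plain round-to-nearest arithmetic in place of the exact ASYMM_MULT_BY_QUANT_MULTIPLIER_* helpers (illustrative, not library code):

    #include <stdint.h>

    /* Scale the S32 accumulator by a Q31 fixed-point multiplier, apply the
     * per-tensor shift, add the output zero point and saturate to QASYMM8. */
    static uint8_t requantize_qasymm8(int32_t acc, int32_t multiplier, int shift, int32_t offset)
    {
        int64_t v = ((int64_t)acc * multiplier + (1LL << 30)) >> 31; /* Q31 multiply, rounded */
        if(shift > 0)
            v = (v + (1LL << (shift - 1))) >> shift; /* multiplier < 1: rounded right shift */
        else
            v <<= -shift;                            /* multiplier > 1: the OUTPUT_SHIFT < 0 path */
        v += offset;                                 /* output zero point */
        return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v)); /* saturate to [0, 255] */
    }

This is why the bias tensor is S32 here: it is added to the 32-bit accumulator before requantization rather than to the 8-bit result.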
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "activation_float_helpers.h"
-#include "helpers.h"
-#include "helpers_asymm.h"
-#include "tile_helpers.h"
-
-#if defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_WIDTH) && defined(DST_HEIGHT) && defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
-//! @cond Doxygen_Suppress
-/** OpenCL kernel to compute the depthwise convolution for floating-point data types (F32/F16)
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The accumulation data type must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=half)
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The convolution strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2)
- * @note The convolution dilations must be passed at compile time using -DDILATION_X and -DDILATION_Y (e.g. -DDILATION_X=2, -DDILATION_Y=2)
- * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH and -DWEI_HEIGHT (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64)
- * @note The channels of the source tensor must be passed at compile time using -DSRC_CHANNELS (e.g. -DSRC_CHANNELS=64)
- * @note The channels of the destination tensor must be passed at compile time using -DDST_CHANNELS (e.g. -DDST_CHANNELS=64)
- * @note The tensor type ("BUFFER" or "IMAGE") of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
- * @note The tensor type ("BUFFER" or "IMAGE") of the weights tensor must be passed at compile time using -DWEI_TENSOR_TYPE (e.g. -DWEI_TENSOR_TYPE=BUFFER)
- * @note The tensor type ("BUFFER" or "IMAGE") of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g.
-DDST_TENSOR_TYPE=BUFFER) - * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float) - * @note The data type of the weights tensor must be passed at compile time using -DWEI_DATA_TYPE (e.g. -DWEI_DATA_TYPE=float) - * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float) - * @note The data type of the accumulators must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=float) - * @note The number of M0 rows (width) to process must be passed at compile time using -DM0 (e.g. -DM0=2) - * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2) - * @note The size of the partial store block in the first dimension must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1) - * @note Only the following configurations of M0 and N0 are currently supported: - * - M0 = 1, 2, 3, 4, 5, .... n (M0 != 1 with STRIDE_X == 1 && DILATION_X == 1 only) - * - N0 = 2, 3, 4, 8, 16 (only 4, 8 and 16 if WEI_TENSOR_TYPE=IMAGE) - * @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1) - * - * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] wei_ptr Pointer to the weights tensor. 
Supported data type: same as @p src_ptr
- * @param[in] wei_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] wei_step_x wei_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] wei_step_y wei_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] wei_step_z wei_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes)
- * @param[in] wei_step_w wei_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] bia_ptr (Optional) Pointer to the bias tensor. Supported data type: same as @p src_ptr (if F32/F16) or S32 (if QASYMM8/QASYMM8_SIGNED)
- * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
- * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
- */
-//! @endcond
-__kernel void dwc_native_fp_nhwc(
-    TENSOR4D(src, SRC_TENSOR_TYPE),
-    TENSOR4D(dst, DST_TENSOR_TYPE),
-    TENSOR4D(wei, WEI_TENSOR_TYPE)
-#if defined(HAS_BIAS)
-    ,
-    VECTOR_DECLARATION(bia)
-#endif // defined(HAS_BIAS)
-)
-{
-    // All the tensor dimensions are passed at compile time.
-    // In case of dynamic tensor support, the following dimensions should be passed as function argument.
-#define _IWEI_WIDTH WEI_WIDTH
-#define _IWEI_HEIGHT WEI_HEIGHT
-#define _ISRC_WIDTH SRC_WIDTH
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _IDST_WIDTH DST_WIDTH
-#define _IDST_HEIGHT DST_HEIGHT
-#define _IDST_CHANNELS DST_CHANNELS
-#define _IM0_A M0_A // _IWEI_WIDTH + (M0 - 1) Rows tile A (If M0 != 1, the tiles overlap by 1 element on the X dimension)
-#define _IN0_A N0 // Cols tile A
-#define _IM0_B _IWEI_WIDTH // Rows tile B
-#define _IN0_B N0 // Cols tile B
-#define _IBOUNDARY_CHECK (!((WEI_WIDTH == 1 && WEI_HEIGHT == 1 && PAD_LEFT == 0 && PAD_TOP == 0 && M0 == 1)))
-
-    const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
-    const int xo   = GET_SPATIAL_IDX(1, M0, 0);          // WIDTH
-#if defined(BATCHED_EXECUTION)
-    const int yo   = GET_SPATIAL_IDX(2, 1, 0) % _IDST_HEIGHT; // HEIGHT
-    const int bout = GET_SPATIAL_IDX(2, 1, 0) / _IDST_HEIGHT; // BATCH SIZE IDX
-#else  // defined(BATCHED_EXECUTION)
-    const int yo   = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
-    const int bout = 0;                        // BATCH SIZE IDX
-#endif // defined(BATCHED_EXECUTION)
-
-    int xi = xo * STRIDE_X;
-    int yi = yo * STRIDE_Y;
-    xi -= PAD_LEFT;
-    yi -= PAD_TOP;
-
-    int d = 0;
-#if DEPTH_MULTIPLIER != 1
-    for(; d < DEPTH_MULTIPLIER; d++)
-#endif // DEPTH_MULTIPLIER != 1
-    {
-        TILE(ACC_DATA_TYPE, M0, N0, c);
-
-        // Reset accumulators
-        LOOP_UNROLLING(int, i, 0, 1, M0,
-        {
-            c[i].v = 0;
-        })
-
-#if _IWEI_HEIGHT <= 5
-        LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
-#else // _IWEI_HEIGHT <= 5
-        for(int yk = 0; yk < _IWEI_HEIGHT; yk++)
-#endif // _IWEI_HEIGHT <= 5
-        {
-            TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
-
-            LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
-            {
-                a[i].v = 0;
-            })
-
-            // Load tile from the src tensor (TILE A)
-            T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout,
_ISRC_WIDTH, _ISRC_HEIGHT, DILATION_X, 1, _IBOUNDARY_CHECK, a); - - TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b); - - // Load tile from the weights tensor (TILE B) - T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, (cout * DEPTH_MULTIPLIER) + d, yk * _IM0_B, 1, wei_stride_y, b); - - // Optimized path for STRIDE_X == 1 - // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension - LOOP_UNROLLING(int, m0, 0, 1, M0, - { - LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH, - { - c[m0].v += a[xk + m0].v * b[xk].v; - }) - }) - } -#if _IWEI_HEIGHT <= 5 - ) -#endif // _IWEI_HEIGHT <= 5 - -#if defined(HAS_BIAS) - TILE(BIA_DATA_TYPE, 1, N0, bias0); - - T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, (cout * DEPTH_MULTIPLIER) + d, 0, 0, 0, bias0); - - // c = c + bias[broadcasted] - T_ADD_BROADCAST_X(ACC_DATA_TYPE, M0, N0, c, bias0, c); -#endif // HAS_BIAS - - T_ACTIVATION(ACC_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c); - - TILE(uint, M0, 1, dst_indirect_y); - - bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; - - if(x_cond) - { - LOOP_UNROLLING(int, m0, 0, 1, M0, - { - int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1); - VSTORE_PARTIAL(N0, PARTIAL_N0) - (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w)); - }) - } - else - { - LOOP_UNROLLING(int, m0, 0, 1, M0, - { - int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1); - VSTORE(N0) - (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w)); - }) - } - } -} -#endif // defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_WIDTH) && defined(DST_HEIGHT) && defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl b/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl deleted file mode 100644 index aa6ba4de39..0000000000 --- a/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl +++ /dev/null @@ -1,284 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
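One detail shared by dwc_native_fp_nhwc above and the quantized variant that follows: with STRIDE_X == 1 and DILATION_X == 1, the M0 output columns computed by one work-item read overlapping input columns, so a single load of M0_A = WEI_WIDTH + (M0 - 1) columns (tile A) feeds all M0 accumulators. A C sketch of that inner product, with a[], b[] and c[] standing in for the kernel's tiles (illustrative only, hypothetical helper):

    /* c[m0]: output column m0; a[]: the M0_A input columns loaded once per
     * weight row; b[]: the WEI_WIDTH weight values of that row. */
    static void dwc_row(float *c, const float *a, const float *b, int m0_count, int wei_width)
    {
        for (int m0 = 0; m0 < m0_count; ++m0)
            for (int xk = 0; xk < wei_width; ++xk)
                c[m0] += a[m0 + xk] * b[xk]; /* adjacent outputs share a[] entries */
    }

This mirrors the kernels' nested LOOP_UNROLLING pair, which is fully unrolled so the compiler can keep the shared a[] entries in registers.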
IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-#include "tile_helpers.h"
-
-#define CALCULATE_WEIGHTS_OFFSET_CORRECTION(A_DATA_TYPE, B_DATA_TYPE) CALCULATE_WEIGHTS_OFFSET_CORRECTION_STR(A_DATA_TYPE, B_DATA_TYPE)
-#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_STR(A_DATA_TYPE, B_DATA_TYPE) CALCULATE_WEIGHTS_OFFSET_CORRECTION_##A_DATA_TYPE##_##B_DATA_TYPE
-#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_char_char (0)
-#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_uchar_uchar (0)
-#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_uchar_char (128)
-#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_char_uchar (-128)
-
-#define T_LOAD_MULTIPLIERS_SHIFT_PER_TENSOR() \
-    ({})
-
-#define T_LOAD_MULTIPLIERS_SHIFT_PER_CHANNEL() \
-    TILE(DST_MULTIPLIERS_DATA_TYPE, 1, N0, multipliers); \
-    TILE(DST_SHIFTS_DATA_TYPE, 1, N0, shifts); \
-    T_LOAD(DST_MULTIPLIERS_DATA_TYPE, 1, N0, BUFFER, dst_multipliers, cout *DEPTH_MULTIPLIER + d, 0, 0, 0, multipliers); \
-    T_LOAD(DST_SHIFTS_DATA_TYPE, 1, N0, BUFFER, dst_shifts, cout *DEPTH_MULTIPLIER + d, 0, 0, 0, shifts);
-
-#define T_LOAD_MULTIPLIERS_SHIFT(QUANTIZATION_TYPE) T_LOAD_MULTIPLIERS_SHIFT_STR(QUANTIZATION_TYPE)
-#define T_LOAD_MULTIPLIERS_SHIFT_STR(QUANTIZATION_TYPE) T_LOAD_MULTIPLIERS_SHIFT_##QUANTIZATION_TYPE()
-
-#if defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_WIDTH) && defined(DST_HEIGHT) && defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
-//! @cond Doxygen_Suppress
-/** OpenCL kernel to compute the depthwise convolution for quantized data types
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: QSYMM8/QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The convolution strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2)
- * @note The convolution dilations must be passed at compile time using -DDILATION_X and -DDILATION_Y (e.g. -DDILATION_X=2, -DDILATION_Y=2)
- * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH and -DWEI_HEIGHT (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64)
- * @note The channels of the source tensor must be passed at compile time using -DSRC_CHANNELS (e.g. -DSRC_CHANNELS=64)
- * @note The channels of the destination tensor must be passed at compile time using -DDST_CHANNELS (e.g. -DDST_CHANNELS=64)
- * @note The tensor type ("BUFFER" or "IMAGE") of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
- * @note The tensor type ("BUFFER" or "IMAGE") of the weights tensor must be passed at compile time using -DWEI_TENSOR_TYPE (e.g. -DWEI_TENSOR_TYPE=BUFFER)
- * @note The tensor type ("BUFFER" or "IMAGE") of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
- * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=int8)
- * @note The data type of the weights tensor must be passed at compile time using -DWEI_DATA_TYPE (e.g. -DWEI_DATA_TYPE=int8)
- * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=int8)
- * @note The data type of the accumulators must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=int)
- * @note The number of M0 rows (width) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
- * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
- * @note The size of the partial store block in the first dimension must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1)
- * @note The activation type must be passed at compile time using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note The A and B variables required by some activation functions must be passed at compile time using -DA_VAL= and -DB_VAL= respectively
- * @note The quantization offset used for both the per-tensor and per-channel quantization must be passed at compile time using -DDST_OFFSET (e.g., -DDST_OFFSET=3)
- * @note The quantization shift for the per-tensor quantization must be passed at compile time using -DDST_SHIFT (e.g., -DDST_SHIFT=1)
- * @note The quantization multiplier for the per-tensor quantization must be passed at compile time using -DDST_MULTIPLIER (e.g., -DDST_MULTIPLIER=121432)
- * @note Only the following configurations of M0 and N0 are currently supported:
- *  - M0 = 1, 2, 3, 4, 5, .... n (M0 != 1 with STRIDE_X == 1 && DILATION_X == 1 only)
- *  - N0 = 2, 3, 4, 8, 16
- * @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1)
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data type: QSYMM8/QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor.
Supported data type: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr - * @param[in] wei_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] wei_step_x wei_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] wei_step_y wei_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] wei_step_z wei_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes) - * @param[in] wei_step_w wei_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] dst_multipliers_ptr Pointer to the destination multipliers tensor for the per-channel quantization. Supported data type: S32 - * @param[in] dst_multipliers_stride_x Stride of the destination multipliers tensor in X dimension (in bytes) - * @param[in] dst_multipliers_step_x dst_multipliers_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_multipliers_offset_first_element_in_bytes The offset of the first element in the destination multipliers tensor - * @param[in] dst_shifts_ptr Pointer to the destination shifts tensor for the per-channel quantization. Supported data type: S32 - * @param[in] dst_shifts_stride_x Stride of the destination shifts tensor in X dimension (in bytes) - * @param[in] dst_shifts_step_x dst_shifts_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_shifts_offset_first_element_in_bytes The offset of the first element in the destination shifts tensor - * @param[in] bia_ptr (Optional) Pointer to the bias tensor Supported data type: S32 - * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes) - * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor - */ -//! 
@endcond -__kernel void dwc_native_quantized_nhwc( - TENSOR4D(src, SRC_TENSOR_TYPE), - TENSOR4D(dst, DST_TENSOR_TYPE), - TENSOR4D(wei, WEI_TENSOR_TYPE), - VECTOR_DECLARATION(dst_multipliers), - VECTOR_DECLARATION(dst_shifts) -#if defined(HAS_BIAS) - , - VECTOR_DECLARATION(bia) -#endif // defined(HAS_BIAS) -) -{ - // All the tensor dimensions are passed at compile time. - // In case of dynamic tensor support, the following dimensions should be passed as function argument. -#define _IWEI_WIDTH WEI_WIDTH -#define _IWEI_HEIGHT WEI_HEIGHT -#define _ISRC_WIDTH SRC_WIDTH -#define _ISRC_HEIGHT SRC_HEIGHT -#define _IDST_WIDTH DST_WIDTH -#define _IDST_HEIGHT DST_HEIGHT -#define _IDST_CHANNELS DST_CHANNELS -#define _IM0_A M0_A // _IWEI_WIDTH + (M0 - 1) Rows tile A (If M0 != 1, the tiles overlap of 1 element on the X dimension) -#define _IN0_A N0 // Cols tile A -#define _IM0_B _IWEI_WIDTH // Rows tile B -#define _IN0_B N0 // Cols tile B -#define _IBOUNDARY_CHECK (!((WEI_WIDTH == 1 && WEI_HEIGHT == 1 && PAD_LEFT == 0 && PAD_TOP == 0 && M0 == 1))) - - const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM - const int xo = GET_SPATIAL_IDX(1, M0, 0); // WIDTH -#if defined(BATCHED_EXECUTION) - const int yo = GET_SPATIAL_IDX(2, 1, 0) % _IDST_HEIGHT; // HEIGHT - const int bout = GET_SPATIAL_IDX(2, 1, 0) / _IDST_HEIGHT; // BATCH SIZE IDX -#else // defined(BATCHED_EXECUTION) - const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT - const int bout = 0; // BATCH SIZE IDX -#endif // defined(BATCHED_EXECUTION) - - int xi = xo * STRIDE_X; - int yi = yo * STRIDE_Y; - xi -= PAD_LEFT; - yi -= PAD_TOP; - - int d = 0; -#if DEPTH_MULTIPLIER != 1 - for(; d < DEPTH_MULTIPLIER; d++) -#endif // DEPTH_MULTIPLIER != 1 - { - TILE(ACC_DATA_TYPE, M0, N0, c); - - // Reset accumulators - LOOP_UNROLLING(int, i, 0, 1, M0, - { - c[i].v = 0; - }) - -#if _IWEI_HEIGHT <= 5 - LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT, -#else // _IWEI_HEIGHT <= 5 - for(int yk = 0; yk < _IWEI_HEIGHT; yk++) -#endif // _IWEI_HEIGHT <= 5 - { - TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a); - - LOOP_UNROLLING(int, i, 0, 1, _IM0_A, - { - a[i].v = ZERO_VALUE; - }) - - // Load tile from the src tensor (TILE A) - T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, DILATION_X, 1, _IBOUNDARY_CHECK, a); - - TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b); - - // Load tile from the weights tensor (TILE B) - T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, cout * DEPTH_MULTIPLIER + d, yk * _IM0_B, 1, wei_stride_y, b); - - // Optimized path for STRIDE_X == 1 - // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension - LOOP_UNROLLING(int, m0, 0, 1, M0, - { - LOOP_UNROLLING(int, n0, 0, 1, N0, - { -#if _IWEI_WIDTH <= 16 -#define DOT_DATA_TYPE SRC_DATA_TYPE -#define WEI_OFFSET_CORRECTION (CALCULATE_WEIGHTS_OFFSET_CORRECTION(SRC_DATA_TYPE, WEI_DATA_TYPE)) - - // Optimized path for the dot instruction - TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, x0); - TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, y0); - ACC_DATA_TYPE offset_a = 0; - ACC_DATA_TYPE offset_b = 0; - - LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH, - { - x0[0].s[xk] = a[xk + m0].s[n0]; - y0[0].s[xk] = b[xk].s[n0] + (int)WEI_OFFSET_CORRECTION; - }) - DOT_PRODUCT_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, y0[0].v, c[m0].s[n0]); - REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, offset_a); - REDUCE_INTEGER8(DOT_DATA_TYPE, 
DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, y0[0].v, offset_b);
- c[m0].s[n0] += offset_a * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION) + offset_b * (ACC_DATA_TYPE)SRC_OFFSET;
-#else // _IWEI_WIDTH <= 16
- LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
- {
- c[m0].s[n0] += ((ACC_DATA_TYPE)a[xk + m0].s[n0] + (ACC_DATA_TYPE)(SRC_OFFSET)) * ((ACC_DATA_TYPE)b[xk].s[n0] + (ACC_DATA_TYPE)(WEI_OFFSET));
- })
-#endif // _IWEI_WIDTH <= 16
- })
- })
- }
-#if _IWEI_HEIGHT <= 5
- )
-#endif // _IWEI_HEIGHT <= 5
-
-#if _IWEI_WIDTH <= 16
- T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * SRC_OFFSET * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION)), c);
-#endif // _IWEI_WIDTH <= 16
-
-#if defined(HAS_BIAS)
- TILE(BIA_DATA_TYPE, 1, N0, bias0);
-
- // Load bias
- T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout * DEPTH_MULTIPLIER + d, 0, 0, 0, bias0);
-
- // c = c + bias[broadcasted]
- T_ADD_BROADCAST_X(ACC_DATA_TYPE, M0, N0, c, bias0, c);
-#endif // defined(HAS_BIAS)
-
- T_LOAD_MULTIPLIERS_SHIFT(QUANTIZATION_TYPE);
-
- // Quantize the tile
- TILE(DST_DATA_TYPE, M0, N0, cq);
- T_QUANTIZE8(ACC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, multipliers, shifts, cq);
-
- // Perform activation
- T_ACTIVATION_QUANTIZED(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, DST_OFFSET, A_VAL, B_VAL, cq, cq);
-
- bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
-
- if(x_cond)
- {
- LOOP_UNROLLING(int, m0, 0, 1, M0,
- {
- int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1);
- VSTORE_PARTIAL(N0, PARTIAL_N0)
- (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
- })
- }
- else
- {
- LOOP_UNROLLING(int, m0, 0, 1, M0,
- {
- int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1);
- VSTORE(N0)
- (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w));
- })
- }
- }
-}
-#endif // defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_WIDTH) && defined(DST_HEIGHT) && defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
\ No newline at end of file
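On the dot-product fast path of the kernel above, the offsets are never applied per tap; instead it uses the expansion sum((a + src_off) * (b + wei_off)) = sum(a * b) + wei_off * sum(a) + src_off * sum(b) + K * src_off * wei_off over the K taps of a filter row, so the inner loop needs only one dot product and two reductions, with the constant term folded in once by T_ADD_CONSTANT (the CALCULATE_WEIGHTS_OFFSET_CORRECTION values fold the signed/unsigned weight conversion into the same algebra). For reference, the -D defines documented above end up as a single build-options string; the C sketch below is illustrative only, describing a hypothetical 3x3, stride-1, pad-1, QASYMM8_SIGNED layer, and is not produced by the library:

/* Hypothetical build options for dwc_native_quantized_nhwc; every value below is an assumption. */
const char *build_opts =
    "-DSRC_WIDTH=32 -DSRC_HEIGHT=32 -DSRC_CHANNELS=64 "
    "-DDST_WIDTH=32 -DDST_HEIGHT=32 -DDST_CHANNELS=64 "
    "-DWEI_WIDTH=3 -DWEI_HEIGHT=3 -DPAD_LEFT=1 -DPAD_TOP=1 "
    "-DSTRIDE_X=1 -DSTRIDE_Y=1 -DDILATION_X=1 -DDILATION_Y=1 -DDEPTH_MULTIPLIER=1 "
    "-DN0=4 -DM0=2 -DM0_A=4 -DPARTIAL_N0=0 " /* M0_A = WEI_WIDTH + (M0 - 1) = 3 + 1 */
    "-DSRC_TENSOR_TYPE=BUFFER -DWEI_TENSOR_TYPE=BUFFER -DDST_TENSOR_TYPE=BUFFER "
    "-DSRC_DATA_TYPE=char -DWEI_DATA_TYPE=char -DDST_DATA_TYPE=char -DACC_DATA_TYPE=int "
    "-DSRC_OFFSET=10 -DWEI_OFFSET=0 -DZERO_VALUE=0 " /* ZERO_VALUE: padding fill value, an assumption */
    "-DDST_OFFSET=5 -DDST_SHIFT=8 -DDST_MULTIPLIER=1073741824 -DQUANTIZATION_TYPE=PER_TENSOR "
    "-DACTIVATION_TYPE=relu -DA_VAL=0 -DB_VAL=0";

diff --git a/src/core/CL/cl_kernels/elementwise_operation.cl b/src/core/CL/cl_kernels/elementwise_operation.cl
deleted file mode 100644
index 45dcbfc6e2..0000000000
--- a/src/core/CL/cl_kernels/elementwise_operation.cl
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.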
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(DATA_TYPE)
-
-/** List of all the operations supported by this kernel.
- * @note ADD and SUB operations, when executed on integers, support saturation */
-#ifdef SATURATE
-#define ADD(x, y) add_sat((x), (y))
-#define SUB(x, y) sub_sat((x), (y))
-#else /* SATURATE */
-#define ADD(x, y) (x) + (y)
-#define SUB(x, y) (x) - (y)
-#endif /* SATURATE */
-
-#define MAX(x, y) max(x, y)
-#define MIN(x, y) min(x, y)
-#define SQUARED_DIFF(x, y) (x - y) * (x - y)
-#define POWER(x, y) pow(x, y)
-
-#if VEC_SIZE_OUT == 1
-#define PRELU(x, y) (x > 0 ? x : x * y)
-#else // VEC_SIZE_OUT == 1
-#define PRELU(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))))
-#endif // VEC_SIZE_OUT == 1
-
-#if defined(S32)
-#define DIV(x, y) CONVERT(floor(CONVERT(x, VEC_DATA_TYPE(float, VEC_SIZE_OUT)) / CONVERT(y, VEC_DATA_TYPE(float, VEC_SIZE_OUT))), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT));
-#else /* S32 */
-#define DIV(x, y) (x / y)
-#endif /* S32 */
-
-#define AND(x, y) (CONVERT((x && y), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))1))
-#define OR(x, y) (CONVERT((x || y), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)) & ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))1))
-
-#define OP_FUN_NAME_STR(op) elementwise_operation_##op
-#define OP_FUN_NAME(op) OP_FUN_NAME_STR(op)
-
-#if defined(ACTIVATION_TYPE)
-#include "activation_float_helpers.h"
-#endif // defined(ACTIVATION_TYPE)
-
-/** This function executes an element-wise operation among two tensors.
- *
- * @note Vector sizes of inputs and output have to be passed at compile time using -DVEC_SIZE_IN1, -DVEC_SIZE_IN2, -DVEC_SIZE_OUT.
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE_OUT
- * @note The input and output data types need to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar
- * @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
- * @note The element-wise operation to be executed has to be passed at compile time using -DOP (e.g., -DOP=ADD)
- *
- * @param[in] in1_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32
- * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] in2_ptr Pointer to the source tensor. Supported data types: U8/S16/F16/F32
- * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] out_ptr Pointer to the destination tensor. Supported data types: U8 (only if both inputs are U8), S16/F16/F32
- * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void OP_FUN_NAME(OP)(
- TENSOR3D_DECLARATION(in1),
- TENSOR3D_DECLARATION(in2)
-#if !defined(IN_PLACE)
- ,
- TENSOR3D_DECLARATION(out)
-#endif // !defined(IN_PLACE)
-)
-{
-#if VEC_SIZE_IN1 == 1
- uint in1_x_offs = 0;
-#else // VEC_SIZE_IN1 == 1
- uint in1_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN1 - (VEC_SIZE_IN1 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN1), 0);
-#endif // VEC_SIZE_IN1 == 1
-#if VEC_SIZE_IN2 == 1
- uint in2_x_offs = 0;
-#else // VEC_SIZE_IN2 == 1
- uint in2_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN2 - (VEC_SIZE_IN2 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN2), 0);
-#endif // VEC_SIZE_IN2 == 1
-#if !defined(IN_PLACE)
- uint out_x_offs = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0);
-#endif // !defined(IN_PLACE)
-
- // Get pixels pointer
- __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + in1_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * in1_step_y + get_global_id(2) * in1_step_z;
- __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + in2_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * in2_step_y + get_global_id(2) * in2_step_z;
- __global uchar *
-#if !defined(IN_PLACE)
- out_addr = out_ptr + out_offset_first_element_in_bytes + out_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z;
-#else // !defined(IN_PLACE)
-#if defined(SRC1_IN_PLACE)
- out_addr = in1_addr;
-#else // defined(SRC1_IN_PLACE)
- out_addr = in2_addr;
-#endif // defined(SRC1_IN_PLACE)
-#endif // !defined(IN_PLACE)
-
- // Load values
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)
- in_a = CONVERT((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE *)in1_addr)), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT));
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)
- in_b = CONVERT((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE *)in2_addr)), VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT));
-
- // Calculate and store result
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)
- res0 = OP(in_a, in_b);
-#if defined(ACTIVATION_TYPE)
- res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE_OUT, res0, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
- STORE_VECTOR_SELECT(res, DATA_TYPE, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
-#endif /* defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(DATA_TYPE) */
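A note on the x-offset arithmetic in the kernel above: rather than predicating the loads, every work-item past the first is shifted back by (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE elements, so full-width vector accesses always stay in bounds and only work-item 0 performs a partial store (the STORE_VECTOR_SELECT condition). The standalone C sketch below reproduces that arithmetic for an assumed width of 20 and vector size of 16; it is illustrative only and not part of the library:

#include <stdio.h>

int main(void)
{
    const int width = 20, vec_size = 16;
    const int leftover = width % vec_size;               /* 4, as passed via -DVEC_SIZE_LEFTOVER */
    const int rewind = (vec_size - leftover) % vec_size; /* 12 */
    const int num_items = (width + vec_size - 1) / vec_size; /* 2 work-items */

    for (int gid = 0; gid < num_items; ++gid)
    {
        int offs = gid * vec_size - rewind;
        if (offs < 0) offs = 0; /* the max(..., 0) in the kernel */
        /* work-item 0 stores only the leftover lanes; the rest store a full vector */
        int stored = (leftover != 0 && gid == 0) ? leftover : vec_size;
        printf("gid %d: loads [%2d,%2d), stores [%2d,%2d)\n", gid, offs, offs + vec_size, offs, offs + stored);
    }
    return 0; /* gid 0: loads [0,16) stores [0,4); gid 1: loads [4,20) stores [4,20) */
}

diff --git a/src/core/CL/cl_kernels/elementwise_operation_quantized.cl b/src/core/CL/cl_kernels/elementwise_operation_quantized.cl
deleted file mode 100644
index a11be80875..0000000000
--- a/src/core/CL/cl_kernels/elementwise_operation_quantized.cl
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.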
- */
-#include "helpers.h"
-
-#define SUB(x, y) (x - y)
-#define ADD(x, y) (x + y)
-#define MAX(x, y) max((x), (y))
-#define MIN(x, y) min((x), (y))
-#define SQUARED_DIFF(x, y) (x - y) * (x - y)
-#define PRELU(x, y) (select(y * x, x, CONVERT((x > (DATA_TYPE)0), SELECT_VEC_DATA_TYPE(float, VEC_SIZE_OUT))))
-#define DIV(x, y) (x / y)
-
-#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
-#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
-
-#define OP_FUN_NAME_STR(op) elementwise_operation_##op##_quantized
-#define OP_FUN_NAME(op) OP_FUN_NAME_STR(op)
-
-#if defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE)
-
-#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE_OUT)
-#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE_OUT)
-#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT)
-
-/** This function executes an element-wise operation among two tensors.
- *
- * @note Vector sizes of inputs and output have to be passed at compile time using -DVEC_SIZE_IN1, -DVEC_SIZE_IN2, -DVEC_SIZE_OUT.
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE_OUT
- * @note In case of broadcasting along the X dimension the proper preprocessor argument should be passed depending on the input (e.g. -DIS_IN1_X_BROADCASTING, -DIS_IN2_X_BROADCASTING)
- * @note The quantization offset of the first operand must be passed at compile time using -DOFFSET_IN1, i.e. -DOFFSET_IN1=10
- * @note The quantization offset of the second operand must be passed at compile time using -DOFFSET_IN2, i.e. -DOFFSET_IN2=10
- * @note The quantization offset of the output must be passed at compile time using -DOFFSET_OUT, i.e. -DOFFSET_OUT=10
- * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, i.e. -DSCALE_IN1=10
- * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, i.e. -DSCALE_IN2=10
- * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, i.e. -DSCALE_OUT=10
- * @note To perform saturating operation -DSATURATE has to be passed to the compiler otherwise wrapping policy will be used.
- * @note The element-wise operation to be executed has to be passed at compile time using -DOP (e.g., -DOP=ADD)
- * @note For QSYMM16 operations OFFSET_IN1, OFFSET_IN2 and OFFSET_OUT must be set to zero
- * @note The data type must be passed at compile time using -DDATA_TYPE, i.e. -DDATA_TYPE=uchar
- *
- * @param[in] in1_ptr Pointer to the source tensor. Supported data types: QASYMM8/QSYMM16
- * @param[in] in1_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in1_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] in2_ptr Pointer to the source tensor. Supported data types: same as @p in1_ptr
- * @param[in] in2_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in2_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] out_ptr Pointer to the destination tensor. Supported data types: same as @p in1_ptr
- * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void OP_FUN_NAME(OP)(
- TENSOR3D_DECLARATION(in1),
- TENSOR3D_DECLARATION(in2)
-#if !defined(IN_PLACE)
- ,
- TENSOR3D_DECLARATION(out)
-#endif // !defined(IN_PLACE)
-)
-{
-#if VEC_SIZE_IN1 == 1
- uint in1_x_offs = 0;
-#else // VEC_SIZE_IN1 == 1
- uint in1_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN1 - (VEC_SIZE_IN1 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN1), 0);
-#endif // VEC_SIZE_IN1 == 1
-#if VEC_SIZE_IN2 == 1
- uint in2_x_offs = 0;
-#else // VEC_SIZE_IN2 == 1
- uint in2_x_offs = max((int)(get_global_id(0) * VEC_SIZE_IN2 - (VEC_SIZE_IN2 - VEC_SIZE_LEFTOVER) % VEC_SIZE_IN2), 0);
-#endif // VEC_SIZE_IN2 == 1
-#if !defined(IN_PLACE)
- uint out_x_offs = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0);
-#endif // !defined(IN_PLACE)
-
- // Get pixels pointer
- __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + in1_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * in1_step_y + get_global_id(2) * in1_step_z;
- __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + in2_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * in2_step_y + get_global_id(2) * in2_step_z;
- __global uchar *
-#if !defined(IN_PLACE)
- out_addr = out_ptr + out_offset_first_element_in_bytes + out_x_offs * sizeof(DATA_TYPE) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z;
-#else // !defined(IN_PLACE)
-#if defined(SRC1_IN_PLACE)
- out_addr = in1_addr;
-#else // defined(SRC1_IN_PLACE)
- out_addr = in2_addr;
-#endif // defined(SRC1_IN_PLACE)
-#endif // !defined(IN_PLACE)
-
- VEC_INT in_a = CONVERT((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE *)in1_addr)), VEC_INT);
- VEC_INT in_b = CONVERT((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE *)in2_addr)), VEC_INT);
-
- in_a = SUB(in_a, (VEC_INT)((int)OFFSET_IN1));
- in_b = SUB(in_b, (VEC_INT)((int)OFFSET_IN2));
-
- const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1);
- const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2);
- const VEC_FLOAT qresf32 = OP(in1f32, in2f32) / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFFSET_OUT));
- const VEC_TYPE res0 = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_TYPE);
-
- // Store result
- STORE_VECTOR_SELECT(res, DATA_TYPE, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
-#endif /* defined(OP) && defined(VEC_SIZE_IN1) && defined(VEC_SIZE_IN2) && defined(VEC_SIZE_OUT) && defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE) */
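The quantized kernel above follows the usual dequantize/operate/requantize pattern: subtract the input offsets, scale to float, apply OP, then divide by the output scale, re-add the output offset, and convert back with round-to-nearest-even plus saturation. A self-contained C sketch of the same per-element math for an ADD on QASYMM8-style values (quantized_add and all parameter values are illustrative assumptions, not library code):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

static uint8_t quantized_add(uint8_t a, uint8_t b,
                             int off1, float s1, int off2, float s2,
                             int off_out, float s_out)
{
    const float fa = (a - off1) * s1;      /* dequantize in1 */
    const float fb = (b - off2) * s2;      /* dequantize in2 */
    float q = (fa + fb) / s_out + off_out; /* requantize the float result */
    q = rintf(q);                          /* round to nearest even, like convert_int_rte */
    if (q < 0.f)   q = 0.f;                /* saturate to the U8 range, like CONVERT_SAT */
    if (q > 255.f) q = 255.f;
    return (uint8_t)q;
}

int main(void)
{
    /* e.g. both inputs and the output quantized with scale 0.5 and offset 10:
       (20-10)*0.5 + (30-10)*0.5 = 15.0 -> 15.0/0.5 + 10 = 40 */
    printf("%d\n", (int)quantized_add(20, 30, 10, 0.5f, 10, 0.5f, 10, 0.5f));
    return 0;
}

diff --git a/src/core/CL/cl_kernels/elementwise_unary.cl b/src/core/CL/cl_kernels/elementwise_unary.cl
deleted file mode 100644
index d2d9d97d33..0000000000
--- a/src/core/CL/cl_kernels/elementwise_unary.cl
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "warp_helpers.h"
-
-#if defined(DATA_TYPE) && defined(OPERATION)
-
-// Calculate exponential
-#define exp_op(input) exp(input)
-// Calculate reverse square root
-#define rsqrt_op(input) rsqrt(input)
-// Calculate negative
-#define neg_op(input) (-input)
-// Calculate sine
-#define sin_op(input) sin(input)
-// Calculate abs for floating point values
-#define fabs_op(input) fabs(input)
-// Calculate natural_log
-#define natural_log_op(input) log(input)
-// Calculate round (Cannot use round function as it rounds halfway cases away from zero).
-#if defined(VEC_SIZE)
-#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-#define round_op(input) CONVERT(CONVERT_SAT_ROUND(input, VEC_DATA_TYPE(int, VEC_SIZE), rte), VEC_TYPE)
-#define logical_not_op(input) CONVERT(CONVERT(!input, VEC_TYPE) & ((VEC_TYPE)0x1), VEC_TYPE)
-#else // defined(VEC_SIZE)
-#define round_op(input) CONVERT(CONVERT_SAT_ROUND(input, int, rte), DATA_TYPE)
-#define logical_not_op(input) ((!input) & 0x1)
-#endif // defined(VEC_SIZE)
-
-/** Applies element wise unary operator in a tensor.
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: F16/F32.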
- * @param[in] in_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] in_step_z in_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: F16/F32.
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
- */
-__kernel void elementwise_unary(
- TENSOR3D_DECLARATION(in),
- TENSOR3D_DECLARATION(out))
-{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(in);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
-#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
- // Check if access on width gets out of bounds
- // If it does shift access vector to access elements within bounds
- const int xi = (int)(get_global_id(0) * VEC_SIZE);
- in.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * in_stride_x;
- out.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * out_stride_x;
-
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
-
- VSTORE(VEC_SIZE)
- (OPERATION(data), 0, (__global DATA_TYPE *)out.ptr);
-#else // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
- *((__global DATA_TYPE *)(out.ptr)) = (DATA_TYPE)(OPERATION(*((__global DATA_TYPE *)in.ptr)));
-#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
-}
-#endif // defined(DATA_TYPE) && defined(OPERATION)
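One detail worth calling out in the unary-operator table above is the round_op comment: OpenCL round() breaks halfway cases away from zero, so the kernel instead converts through a saturating round-to-nearest-even (rte) conversion. A two-line C illustration of the difference (rint() uses the default round-to-nearest-even mode):

#include <math.h>
#include <stdio.h>

int main(void)
{
    /* round() breaks ties away from zero; rint() (default mode) breaks ties to even */
    printf("round(2.5) = %.0f, rint(2.5) = %.0f\n", round(2.5), rint(2.5)); /* 3 vs 2 */
    printf("round(3.5) = %.0f, rint(3.5) = %.0f\n", round(3.5), rint(3.5)); /* 4 vs 4 */
    return 0;
}

diff --git a/src/core/CL/cl_kernels/fft.cl b/src/core/CL/cl_kernels/fft.cl
deleted file mode 100644
index 51763a620a..0000000000
--- a/src/core/CL/cl_kernels/fft.cl
+++ /dev/null
@@ -1,1880 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 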
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) -/** Calculates and applies the twiddle factor to a given input. - * - * @param[in] phi The angle. - * @param[in,out] input The input on which the factor should be applied. - */ -#define TWIDDLE_FACTOR_MULTIPLICATION(phi, input) \ - { \ - VEC_DATA_TYPE(DATA_TYPE, 2) \ - w, tmp; \ - w.x = cos(phi); \ - w.y = sin(phi); \ - tmp.x = (w.x * input.x) - (w.y * input.y); \ - tmp.y = (w.x * input.y) + (w.y * input.x); \ - input = tmp; \ - } - -/** Computes radix-2 butterfly unit. - * - * @param[in,out] c0 Complex input 0. - * @param[in,out] c1 Complex input 1. - */ -#define DFT_2(c0, c1) \ - { \ - VEC_DATA_TYPE(DATA_TYPE, 2) \ - v0; \ - v0 = c0; \ - c0 = v0 + c1; \ - c1 = v0 - c1; \ - } - -// radix-3 butterfly unit factors -#define SQRT3DIV2 0.86602540378443f - -/** Computes radix-3 butterfly unit. - * - * @param[in,out] c0 Complex input 0. - * @param[in,out] c1 Complex input 1. - * @param[in,out] c2 Complex input 2. - */ -#define DFT_3(c0, c1, c2) \ - { \ - VEC_DATA_TYPE(DATA_TYPE, 2) \ - v0 = c1 + c2; \ - VEC_DATA_TYPE(DATA_TYPE, 2) \ - v1 = c1 - c2; \ - c1.x = c0.x - 0.5f * v0.x + v1.y * SQRT3DIV2; \ - c1.y = c0.y - 0.5f * v0.y - v1.x * SQRT3DIV2; \ - c2.x = c0.x - 0.5f * v0.x - v1.y * SQRT3DIV2; \ - c2.y = c0.y - 0.5f * v0.y + v1.x * SQRT3DIV2; \ - c0 = c0 + v0; \ - } - -/**Computes radix-4 butterfly unit. - * - * @param[in,out] c0 Complex input 0. - * @param[in,out] c1 Complex input 1. - * @param[in,out] c2 Complex input 2. - * @param[in,out] c3 Complex input 3. - */ -#define DFT_4(c0, c1, c2, c3) \ - { \ - VEC_DATA_TYPE(DATA_TYPE, 2) \ - v0, v1, v2, v3; \ - v0 = c0 + c2; \ - v1 = c1 + c3; \ - v2 = c0 - c2; \ - v3.x = c1.y - c3.y; \ - v3.y = c3.x - c1.x; \ - c0 = v0 + v1; \ - c2 = v0 - v1; \ - c1 = v2 + v3; \ - c3 = v2 - v3; \ - } - -// radix-5 butterfly unit factors -#define W5_A (DATA_TYPE)0.30901699437494f -#define W5_B (DATA_TYPE)0.95105651629515f -#define W5_C (DATA_TYPE)0.80901699437494f -#define W5_D (DATA_TYPE)0.58778525229247f - -/** Computes radix-5 butterfly unit. - * - * @param[in,out] c0 Complex input 0. - * @param[in,out] c1 Complex input 1. - * @param[in,out] c2 Complex input 2. - * @param[in,out] c3 Complex input 3. - * @param[in,out] c4 Complex input 4. - */ -#define DFT_5(c0, c1, c2, c3, c4) \ - { \ - VEC_DATA_TYPE(DATA_TYPE, 2) \ - v0, v1, v2, v3, v4; \ - v0 = c0; \ - v1 = W5_A * (c1 + c4) - W5_C * (c2 + c3); \ - v2 = W5_C * (c1 + c4) - W5_A * (c2 + c3); \ - v3 = W5_D * (c1 - c4) - W5_B * (c2 - c3); \ - v4 = W5_B * (c1 - c4) + W5_D * (c2 - c3); \ - c0 = v0 + c1 + c2 + c3 + c4; \ - c1 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v4.y, -v4.x); \ - c2 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v3.y, -v3.x); \ - c3 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v3.y, v3.x); \ - c4 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v4.y, v4.x); \ - } - -// radix-7 butterfly unit factors -#define W7_A (DATA_TYPE)0.62348980185873f -#define W7_B (DATA_TYPE)0.78183148246802f -#define W7_C (DATA_TYPE)0.22252093395631f -#define W7_D (DATA_TYPE)0.97492791218182f -#define W7_E (DATA_TYPE)0.90096886790241f -#define W7_F (DATA_TYPE)0.43388373911755f - -/** Computes radix-7 butterfly unit. - * - * @param[in,out] c0 Complex input 0. - * @param[in,out] c1 Complex input 1. 
- * @param[in,out] c2 Complex input 2.
- * @param[in,out] c3 Complex input 3.
- * @param[in,out] c4 Complex input 4.
- * @param[in,out] c5 Complex input 5.
- * @param[in,out] c6 Complex input 6.
- */
-#define DFT_7(c0, c1, c2, c3, c4, c5, c6) \
- { \
- VEC_DATA_TYPE(DATA_TYPE, 2) \
- v0, v1, v2, v3, v4, v5, v6; \
- v0 = c0; \
- v1 = W7_A * (c1 + c6) - W7_C * (c2 + c5) - W7_E * (c3 + c4); \
- v2 = W7_C * (c1 + c6) + W7_E * (c2 + c5) - W7_A * (c3 + c4); \
- v3 = W7_E * (c1 + c6) - W7_A * (c2 + c5) + W7_C * (c3 + c4); \
- v4 = W7_B * (c1 - c6) + W7_D * (c2 - c5) + W7_F * (c3 - c4); \
- v5 = W7_D * (c1 - c6) - W7_F * (c2 - c5) - W7_B * (c3 - c4); \
- v6 = W7_F * (c1 - c6) - W7_B * (c2 - c5) + W7_D * (c3 - c4); \
- c0 = v0 + c1 + c2 + c3 + c4 + c5 + c6; \
- c1 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v4.y, -v4.x); \
- c2 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v5.y, -v5.x); \
- c3 = v0 - v3 + (VEC_DATA_TYPE(DATA_TYPE, 2))(v6.y, -v6.x); \
- c4 = v0 - v3 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v6.y, v6.x); \
- c5 = v0 - v2 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v5.y, v5.x); \
- c6 = v0 + v1 + (VEC_DATA_TYPE(DATA_TYPE, 2))(-v4.y, v4.x); \
- }
-
-/** Computes radix-8 butterfly unit.
- *
- * @param[in,out] c0 Complex input 0.
- * @param[in,out] c1 Complex input 1.
- * @param[in,out] c2 Complex input 2.
- * @param[in,out] c3 Complex input 3.
- * @param[in,out] c4 Complex input 4.
- * @param[in,out] c5 Complex input 5.
- * @param[in,out] c6 Complex input 6.
- * @param[in,out] c7 Complex input 7.
- */
-#define DFT_8(c0, c1, c2, c3, c4, c5, c6, c7) \
- { \
- VEC_DATA_TYPE(DATA_TYPE, 2) \
- v0, v1, v2, v3, v4, v5, v6, v7; \
- VEC_DATA_TYPE(DATA_TYPE, 2) \
- s0, s1, s2, s3, s4, s5, s6, s7; \
- VEC_DATA_TYPE(DATA_TYPE, 2) \
- t0, t1, t2; \
- v0 = c0 + c4; \
- v1 = c1 + c5; \
- v2 = c2 + c6; \
- v3 = c3 + c7; \
- v4 = c0 - c4; \
- v5 = c1 - c5; \
- v6 = c2 - c6; \
- v7 = c3 - c7; \
- s0 = v0 + v2; \
- s1 = v1 + v3; \
- s2 = v0 - v2; \
- s3 = v1 - v3; \
- s4.x = v4.x - v6.y; \
- s4.y = v4.y + v6.x; \
- s5.x = v5.x - v7.y; \
- s5.y = v5.y + v7.x; \
- s6.x = v4.x + v6.y; \
- s6.y = v4.y - v6.x; \
- s7.x = v5.x + v7.y; \
- s7.y = v5.y - v7.x; \
- t0.x = -s3.y; \
- t0.y = s3.x; \
- t1.x = M_SQRT1_2_F * (s5.x - s5.y); \
- t1.y = M_SQRT1_2_F * (s5.x + s5.y); \
- t2.x = -M_SQRT1_2_F * (s7.x + s7.y); \
- t2.y = M_SQRT1_2_F * (s7.x - s7.y); \
- c0 = s0 + s1; \
- c1 = s6 - t2; \
- c2 = s2 - t0; \
- c3 = s4 - t1; \
- c4 = s0 - s1; \
- c5 = s6 + t2; \
- c6 = s2 + t0; \
- c7 = s4 + t1; \
- }
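Before the per-radix kernel bodies below, note that the two primitives used throughout this file are the twiddle rotation by e^(i*phi) (TWIDDLE_FACTOR_MULTIPLICATION) and the butterfly units (DFT_2 through DFT_8); DFT_2, for instance, is just X0 = c0 + c1, X1 = c0 - c1. The following standalone C sketch shows both in scalar form with illustrative inputs; it is not part of the kernels:

#include <math.h>
#include <stdio.h>

typedef struct { float x, y; } complex_f; /* (real, imaginary), like the kernel's 2-wide vectors */

/* Rotate 'in' by e^(i*phi): the scalar form of TWIDDLE_FACTOR_MULTIPLICATION */
static complex_f twiddle(float phi, complex_f in)
{
    const complex_f w = { cosf(phi), sinf(phi) };
    const complex_f out = { w.x * in.x - w.y * in.y, w.x * in.y + w.y * in.x };
    return out;
}

int main(void)
{
    /* Radix-2 butterfly (DFT_2): a 2-point DFT of (c0, c1) */
    const complex_f c0 = { 1.0f, 0.0f }, c1 = { 0.0f, 1.0f };
    const complex_f X0 = { c0.x + c1.x, c0.y + c1.y }; /* sum term */
    const complex_f X1 = { c0.x - c1.x, c0.y - c1.y }; /* difference term */
    printf("X0 = (%g, %g), X1 = (%g, %g)\n", X0.x, X0.y, X1.x, X1.y);

    /* Rotating (1, 0) by phi = pi/2 gives approximately (0, 1), i.e. multiplication by i */
    const complex_f r = twiddle(1.57079632679f, c0);
    printf("r = (%g, %g)\n", r.x, r.y);
    return 0;
}

-
-/** Computes the first stage of a radix-2 DFT on axis 0.
- *
- * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
- *
- * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr (Optional) Pointer to the destination image. 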
Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image - */ -__kernel void fft_radix_2_first_stage_axis_0( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ -) -{ - // Get tensor pointers - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); -#ifdef IN_PLACE - Tensor3D output = input; -#else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); -#endif /* IN_PLACE */ - - // Load two complex input values - VEC_DATA_TYPE(DATA_TYPE, 4) - data = vload4(0, (__global DATA_TYPE *)input.ptr); - - // Compute DFT N = 2 - DFT_2(data.s01, data.s23); - - // Store two complex output values - vstore4(data, 0, (__global DATA_TYPE *)output.ptr); -} - -/** Computes the first stage of a radix-2 DFT on axis 1. - * - * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time - * - * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/f32 - * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr (Optional) Pointer to the destination image. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image - */ -__kernel void fft_radix_2_first_stage_axis_1( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ -) -{ - // Get tensor pointers - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); -#ifdef IN_PLACE - Tensor3D output = input; -#else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); -#endif /* IN_PLACE */ - - // Load two complex input values - VEC_DATA_TYPE(DATA_TYPE, 2) - data1 = vload2(0, (__global DATA_TYPE *)input.ptr); - VEC_DATA_TYPE(DATA_TYPE, 2) - data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); - - // Compute DFT N = 2 - DFT_2(data1, data2); - - // Store two complex output values - vstore2(data1, 0, (__global DATA_TYPE *)output.ptr); - vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0)); -} - -/** Computes the first stage of a radix-3 DFT on axis 0. - * - * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time - * - * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/f32 - * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr (Optional) Pointer to the destination image. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image - */ -__kernel void fft_radix_3_first_stage_axis_0( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ -) -{ - // Get tensor pointers - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); -#ifdef IN_PLACE - Tensor3D output = input; -#else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); -#endif /* IN_PLACE */ - - // Load three complex input values - VEC_DATA_TYPE(DATA_TYPE, 4) - data0 = vload4(0, (__global DATA_TYPE *)input.ptr); - VEC_DATA_TYPE(DATA_TYPE, 2) - data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2, 0, 0)); - - // Compute DFT N = 3 - DFT_3(data0.s01, data0.s23, data1.s01); - - // Store three complex output values - vstore4(data0, 0, (__global DATA_TYPE *)output.ptr); - vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2, 0, 0)); -} - -/** Computes the first stage of a radix-3 DFT on axis 1. - * - * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time - * - * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/f32 - * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr (Optional) Pointer to the destination image. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image - */ -__kernel void fft_radix_3_first_stage_axis_1( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ -) -{ - // Get tensor pointers - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); -#ifdef IN_PLACE - Tensor3D output = input; -#else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); -#endif /* IN_PLACE */ - - // Load three complex input values - VEC_DATA_TYPE(DATA_TYPE, 2) - data0 = vload2(0, (__global DATA_TYPE *)input.ptr); - VEC_DATA_TYPE(DATA_TYPE, 2) - data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); - - // Compute DFT N = 3 - DFT_3(data0, data1, data2); - - // Store three complex output values - vstore2(data0, 0, (__global DATA_TYPE *)output.ptr); - vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0)); - vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0)); -} - -/** Computes the first stage of a radix-4 DFT on axis 0. - * - * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time - * - * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/f32 - * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr (Optional) Pointer to the destination image. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image - */ -__kernel void fft_radix_4_first_stage_axis_0( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ -) -{ - // Get tensor pointers - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); -#ifdef IN_PLACE - Tensor3D output = input; -#else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); -#endif /* IN_PLACE */ - - // Load four complex input values - VEC_DATA_TYPE(DATA_TYPE, 8) - data = vload8(0, (__global DATA_TYPE *)input.ptr); - - // Compute DFT N = 4 - DFT_4(data.s01, data.s23, data.s45, data.s67); - - // Store four complex output values - vstore8(data, 0, (__global DATA_TYPE *)output.ptr); -} - -/** Computes the first stage of a radix-4 DFT on axis 1. - * - * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time - * - * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/f32 - * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr (Optional) Pointer to the destination image. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image - */ -__kernel void fft_radix_4_first_stage_axis_1( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ -) -{ - // Get tensor pointers - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); -#ifdef IN_PLACE - Tensor3D output = input; -#else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); -#endif /* IN_PLACE */ - - // Load four complex input values - VEC_DATA_TYPE(DATA_TYPE, 2) - data0 = vload2(0, (__global DATA_TYPE *)input.ptr); - VEC_DATA_TYPE(DATA_TYPE, 2) - data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - data3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0)); - - // Compute DFT N = 4 - DFT_4(data0, data1, data2, data3); - - // Store four complex output values - vstore2(data0, 0, (__global DATA_TYPE *)output.ptr); - vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0)); - vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0)); - vstore2(data3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3, 0)); -} - -/** Computes the first stage of a radix-5 DFT on axis 0. - * - * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time - * - * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/f32 - * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr (Optional) Pointer to the destination image. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image - */ -__kernel void fft_radix_5_first_stage_axis_0( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ -) -{ - // Get tensor pointers - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); -#ifdef IN_PLACE - Tensor3D output = input; -#else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); -#endif /* IN_PLACE */ - - // Load five complex input values - VEC_DATA_TYPE(DATA_TYPE, 8) - data0 = vload8(0, (__global DATA_TYPE *)input.ptr); - VEC_DATA_TYPE(DATA_TYPE, 2) - data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4, 0, 0)); - - // Compute DFT N = 5 - DFT_5(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01); - - // Store five complex output values - vstore8(data0, 0, (__global DATA_TYPE *)output.ptr); - vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4, 0, 0)); -} - -/** Computes the first stage of a radix-5 DFT on axis 1. - * - * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time - * - * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/f32 - * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr (Optional) Pointer to the destination image. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image - */ -__kernel void fft_radix_5_first_stage_axis_1( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ -) -{ - // Get tensor pointers - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); -#ifdef IN_PLACE - Tensor3D output = input; -#else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); -#endif /* IN_PLACE */ - - // Load five complex input values - VEC_DATA_TYPE(DATA_TYPE, 2) - data0 = vload2(0, (__global DATA_TYPE *)input.ptr); - VEC_DATA_TYPE(DATA_TYPE, 2) - data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - data3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - data4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0)); - - // Compute DFT N = 5 - DFT_5(data0, data1, data2, data3, data4); - - // Store five complex output values - vstore2(data0, 0, (__global DATA_TYPE *)output.ptr); - vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0)); - vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0)); - vstore2(data3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3, 0)); - vstore2(data4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4, 0)); -} - -/** Computes the first stage of a radix-7 DFT on axis 0. - * - * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time - * - * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/f32 - * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr (Optional) Pointer to the destination image. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image - */ -__kernel void fft_radix_7_first_stage_axis_0( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ -) -{ - // Get tensor pointers - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); -#ifdef IN_PLACE - Tensor3D output = input; -#else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); -#endif /* IN_PLACE */ - - // Load seven complex input values - VEC_DATA_TYPE(DATA_TYPE, 8) - data0 = vload8(0, (__global DATA_TYPE *)input.ptr); - VEC_DATA_TYPE(DATA_TYPE, 4) - data1 = vload4(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4, 0, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 6, 0, 0)); - - // Compute DFT N = 7 - DFT_7(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01, data1.s23, data2.s01); - - // Store seven complex output values - vstore8(data0, 0, (__global DATA_TYPE *)output.ptr); - vstore4(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4, 0, 0)); - vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 6, 0, 0)); -} - -/** Computes the first stage of a radix-7 DFT on axis 1. - * - * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time - * - * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/f32 - * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr (Optional) Pointer to the destination image. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image - */ -__kernel void fft_radix_7_first_stage_axis_1( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ -) -{ - // Get tensor pointers - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); -#ifdef IN_PLACE - Tensor3D output = input; -#else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); -#endif /* IN_PLACE */ - - // Load seven complex input values - VEC_DATA_TYPE(DATA_TYPE, 2) - data0 = vload2(0, (__global DATA_TYPE *)input.ptr); - VEC_DATA_TYPE(DATA_TYPE, 2) - data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - data3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - data4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - data5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - data6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6, 0)); - - // Compute DFT N = 7 - DFT_7(data0, data1, data2, data3, data4, data5, data6); - - // Store seven complex output values - vstore2(data0, 0, (__global DATA_TYPE *)output.ptr); - vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0)); - vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0)); - vstore2(data3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3, 0)); - vstore2(data4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4, 0)); - vstore2(data5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 5, 0)); - vstore2(data6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 6, 0)); -} - -/** Computes the first stage of a radix-8 DFT on axis 0. - * - * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time - * - * @param[in,out] input_ptr Pointer to the source tensor. 
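The first-stage kernels above apply no twiddle rotation: with a butterfly span of Nx = 1 every twiddle factor degenerates to 1, so each work item only evaluates one small dense DFT through the DFT_N macros defined earlier in fft.cl (outside this hunk). As a reminder of what those macros compute, a radix-N block is, up to the sign convention chosen for forward versus inverse transforms:

\[
X_k \;=\; \sum_{n=0}^{N-1} x_n \, e^{\mp 2\pi i \, nk / N}, \qquad k = 0, \dots, N-1 .
\]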
Supported data types: F16/f32 - * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image - */ -__kernel void fft_radix_8_first_stage_axis_0( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ -) -{ - // Get tensor pointers - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); -#ifdef IN_PLACE - Tensor3D output = input; -#else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); -#endif /* IN_PLACE */ - - // Load eight complex input values - VEC_DATA_TYPE(DATA_TYPE, 16) - data = vload16(0, (__global DATA_TYPE *)input.ptr); - - // Compute DFT N = 8 - DFT_8(data.s01, data.s23, data.s45, data.s67, data.s89, data.sAB, data.sCD, data.sEF); - - // Store eight complex output values - vstore16(data, 0, (__global DATA_TYPE *)output.ptr); -} - -/** Computes the first stage of a radix-8 DFT on axis 1. - * - * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time - * - * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/f32 - * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr (Optional) Pointer to the destination image. 
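The radix-8 first-stage kernel on axis 0 can fetch all eight complex inputs with a single vload16 because complex values are stored interleaved (re, im, re, im, ...); the .s01/.s23/... swizzles then address each complex pair. A minimal C sketch of that layout, using a hypothetical buf array in place of the kernel's tensor accessors:

    #include <stdio.h>

    int main(void)
    {
        /* 8 interleaved complex values, as a vload16 would see them. */
        float buf[16];
        for (int k = 0; k < 8; ++k) {
            buf[2 * k + 0] = (float)k;        /* real part      -> .s0,.s2,...,.sE */
            buf[2 * k + 1] = (float)k + 0.5f; /* imaginary part -> .s1,.s3,...,.sF */
        }
        for (int k = 0; k < 8; ++k) {
            printf("x[%d] = %g + %gi\n", k, buf[2 * k], buf[2 * k + 1]);
        }
        return 0;
    }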
Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image - */ -__kernel void fft_radix_8_first_stage_axis_1( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ -) -{ - // Get tensor pointers - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); -#ifdef IN_PLACE - Tensor3D output = input; -#else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); -#endif /* IN_PLACE */ - - // Load eight complex input values - VEC_DATA_TYPE(DATA_TYPE, 2) - data0 = vload2(0, (__global DATA_TYPE *)input.ptr); - VEC_DATA_TYPE(DATA_TYPE, 2) - data1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - data2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - data3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - data4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - data5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - data6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - data7 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 7, 0)); - - // Compute DFT N = 8 - DFT_8(data0, data1, data2, data3, data4, data5, data6, data7); - - // Store eight complex output values - vstore2(data0, 0, (__global DATA_TYPE *)output.ptr); - vstore2(data1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 1, 0)); - vstore2(data2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2, 0)); - vstore2(data3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3, 0)); - vstore2(data4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4, 0)); - vstore2(data5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 5, 0)); - vstore2(data6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 6, 0)); - vstore2(data7, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 7, 0)); -} - -/** Computes a stage of a radix-2 FFT on axis 0. - * - * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time - * - * @param[in,out] input_ptr Pointer to the source tensor. 
Supported data types: F16/f32 - * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image - * @param[in] Nx The butterfly span. Products of radix order of previous radix's stage - * @param[in] Ni Nx * Ny. - * @param[in] exp_const Exponent constant - */ -__kernel void fft_radix_2_axis_0( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ - , - uint Nx, uint Ni, float exp_const) -{ - // Each work-item computes a single radix-2 - uint kx = get_global_id(0); - - // Compute nx - uint nx = kx % Nx; - - // Compute n index - uint n = nx + (kx / Nx) * Ni; - - // Get tensor pointers - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input); - input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z; -#ifdef IN_PLACE - Tensor3D output = input; -#else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output); - output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z; -#endif /* IN_PLACE */ - - // Load two complex input values - VEC_DATA_TYPE(DATA_TYPE, 2) - c0 = vload2(0, (__global DATA_TYPE *)input.ptr); - VEC_DATA_TYPE(DATA_TYPE, 2) - c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0)); - - // Compute phi - DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const; - - // Multiply by twiddle factor - TWIDDLE_FACTOR_MULTIPLICATION(phi, c1); - - // Compute DFT N = 2 - DFT_2(c0, c1); - - // Store two complex output values - vstore2(c0, 0, (__global DATA_TYPE *)output.ptr); - vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0)); -} - -/** Computes a stage of a radix-2 FFT on axis 1. - * - * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time - * - * @param[in,out] input_ptr Pointer to the source tensor. 
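The per-stage index arithmetic above is the standard decimation-in-time layout: kx enumerates butterflies, nx = kx % Nx locates the butterfly inside the current span, and n = nx + (kx / Nx) * Ni (with Ni presumably Nx times the stage radix) is the offset of the first of the Nx-strided inputs. A scalar C sketch of one radix-2 butterfly under that reading; exp_const is set by host code that is not visible in this hunk, so the forward-transform value assumed in the comment is an assumption:

    #include <complex.h>

    /* One radix-2 butterfly, mirroring fft_radix_2_axis_0 in scalar C.
       Assumes exp_const = -2*pi / (2 * Nx) for a forward transform. */
    static void radix2_butterfly(float complex *x, unsigned kx,
                                 unsigned Nx, unsigned Ni, float exp_const)
    {
        unsigned nx = kx % Nx;              /* position inside the butterfly span */
        unsigned n  = nx + (kx / Nx) * Ni;  /* index of the first input element   */
        float phi   = (float)nx * exp_const;

        float complex c0 = x[n];
        float complex c1 = x[n + Nx] * cexpf(I * phi); /* twiddle rotation */

        x[n]      = c0 + c1;  /* DFT of length 2 */
        x[n + Nx] = c0 - c1;
    }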
Supported data types: F16/f32 - * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image - * @param[in] Nx The butterfly span. Products of radix order of previous radix's stage - * @param[in] Ni Nx * Ny. - * @param[in] exp_const Exponent constant - */ -__kernel void fft_radix_2_axis_1( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ - , - uint Nx, uint Ni, float exp_const) -{ - // Each work-item computes a single radix-2 - uint kx = get_global_id(1); - - // Compute nx - uint nx = kx % Nx; - - // Compute n index - uint n = nx + (kx / Nx) * Ni; - - // Get tensor pointers - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input); - input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z; -#ifdef IN_PLACE - Tensor3D output = input; -#else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output); - output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z; -#endif /* IN_PLACE */ - - // Load two complex input values - VEC_DATA_TYPE(DATA_TYPE, 2) - c0 = vload2(0, (__global DATA_TYPE *)input.ptr); - VEC_DATA_TYPE(DATA_TYPE, 2) - c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0)); - - // Compute phi - DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const; - - // Multiply by twiddle factor - TWIDDLE_FACTOR_MULTIPLICATION(phi, c1); - - // Compute DFT N = 2 - DFT_2(c0, c1); - - // Store two complex output values - vstore2(c0, 0, (__global DATA_TYPE *)output.ptr); - vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0)); -} - -/** Computes a stage of a radix-3 FFT on axis 0. - * - * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time - * - * @param[in,out] input_ptr Pointer to the source tensor. 
Supported data types: F16/f32 - * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image - * @param[in] Nx The butterfly span. Products of radix order of previous radix's stage - * @param[in] Ni Nx * Ny. - * @param[in] exp_const Exponent constant - */ -__kernel void fft_radix_3_axis_0( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ - , - uint Nx, uint Ni, float exp_const) -{ - // Each work-item computes a single radix-3 - uint kx = get_global_id(0); - - // Compute nx - uint nx = kx % Nx; - - // Compute n index - uint n = nx + (kx / Nx) * Ni; - - // Get tensor pointers - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input); - input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z; -#ifdef IN_PLACE - Tensor3D output = input; -#else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output); - output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z; -#endif /* IN_PLACE */ - - // Load three complex input values - VEC_DATA_TYPE(DATA_TYPE, 2) - c0 = vload2(0, (__global DATA_TYPE *)input.ptr); - VEC_DATA_TYPE(DATA_TYPE, 2) - c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0)); - - // Compute phi - DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const; - - // Multiply by twiddle factor - TWIDDLE_FACTOR_MULTIPLICATION(phi, c1); - TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2); - - // Compute DFT N = 3 - DFT_3(c0, c1, c2); - - // Store three complex output values - vstore2(c0, 0, (__global DATA_TYPE *)output.ptr); - vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0)); - vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0)); -} - -/** Computes a stage of a radix-3 FFT on axis 1. 
- * - * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time - * - * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in,out] input_step_x input_stride_x * number of elements along X processed per work item (in bytes) - * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per work item (in bytes) - * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] output_stride_z (Optional) Stride of the destination image in Z dimension (in bytes) - * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per work item (in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image - * @param[in] Nx The butterfly span, i.e. the product of the radix orders of all previous stages - * @param[in] Ni The product Nx * Ny.
- * @param[in] exp_const Exponent constant - */ -__kernel void fft_radix_3_axis_1( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ - , - uint Nx, uint Ni, float exp_const) -{ - // Each work-item computes a single radix-3 - uint kx = get_global_id(1); - - // Compute nx - uint nx = kx % Nx; - - // Compute n index - uint n = nx + (kx / Nx) * Ni; - - // Get tensor pointers - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input); - input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z; -#ifdef IN_PLACE - Tensor3D output = input; -#else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output); - output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z; -#endif /* IN_PLACE */ - - // Load three complex input values - VEC_DATA_TYPE(DATA_TYPE, 2) - c0 = vload2(0, (__global DATA_TYPE *)input.ptr); - VEC_DATA_TYPE(DATA_TYPE, 2) - c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0)); - - // Compute phi - DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const; - - // Multiply by twiddle factor - TWIDDLE_FACTOR_MULTIPLICATION(phi, c1); - TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2); - - // Compute DFT N = 3 - DFT_3(c0, c1, c2); - - // Store three complex output values - vstore2(c0, 0, (__global DATA_TYPE *)output.ptr); - vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0)); - vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0)); -} - -/** Computes a stage of a radix-4 FFT on axis 0. - * - * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time - * - * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/f32 - * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr (Optional) Pointer to the destination image. 
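A stage kernel exists for each supported radix (2, 3, 4, 5, 7, 8), so the host presumably factorizes the transform length into those radices and enqueues one stage per factor, growing the butterfly span Nx as it goes. The scheduling logic lives in the library's FFT functions rather than in this file, so the following C sketch is illustrative only:

    #include <stdio.h>

    int main(void)
    {
        unsigned N = 2520; /* example length: 2^3 * 3^2 * 5 * 7 */
        const unsigned radices[] = { 8, 7, 5, 4, 3, 2 };
        unsigned Nx = 1; /* butterfly span before the first stage */
        for (size_t i = 0; i < sizeof radices / sizeof *radices;) {
            if (N % radices[i] == 0) {
                printf("stage: radix-%u, Nx = %u\n", radices[i], Nx);
                Nx *= radices[i];
                N  /= radices[i];
            } else {
                ++i;
            }
        }
        return N == 1 ? 0 : 1; /* leftover factor => length not supported */
    }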
Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image - * @param[in] Nx The butterfly span. Products of radix order of previous radix's stage - * @param[in] Ni Nx * Ny. - * @param[in] exp_const Exponent constant - */ -__kernel void fft_radix_4_axis_0( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ - , - uint Nx, uint Ni, float exp_const) -{ - // Each work-item computes a single radix-4 - uint kx = get_global_id(0); - - // Compute nx - uint nx = kx % Nx; - - // Compute n index - uint n = nx + (kx / Nx) * Ni; - - // Get tensor pointers - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input); - input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z; -#ifdef IN_PLACE - Tensor3D output = input; -#else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output); - output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z; -#endif /* IN_PLACE */ - - // Load four complex input values - VEC_DATA_TYPE(DATA_TYPE, 2) - c0 = vload2(0, (__global DATA_TYPE *)input.ptr); - VEC_DATA_TYPE(DATA_TYPE, 2) - c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 3 * Nx, 0, 0)); - - // Compute phi - DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const; - - // Multiply by twiddle factor - TWIDDLE_FACTOR_MULTIPLICATION(phi, c1); - TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2); - TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3); - - // Compute DFT N = 4 - DFT_4(c0, c1, c2, c3); - - // Store four complex output values - vstore2(c0, 0, (__global DATA_TYPE *)output.ptr); - vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0)); - vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0)); - vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 3 * Nx, 0, 0)); -} - -/** Computes a stage of a radix-4 FFT on axis 1. - * - * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time - * - * @param[in,out] input_ptr Pointer to the source tensor. 
Supported data types: F16/f32 - * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image - * @param[in] Nx The butterfly span. Products of radix order of previous radix's stage - * @param[in] Ni Nx * Ny. - * @param[in] exp_const Exponent constant - */ -__kernel void fft_radix_4_axis_1( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ - , - uint Nx, uint Ni, float exp_const) -{ - // Each work-item computes a single radix-4 - uint kx = get_global_id(1); - - // Compute nx - uint nx = kx % Nx; - - // Compute n index - uint n = nx + (kx / Nx) * Ni; - - // Get tensor pointers - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input); - input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z; -#ifdef IN_PLACE - Tensor3D output = input; -#else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output); - output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z; -#endif /* IN_PLACE */ - - // Load four complex input values - VEC_DATA_TYPE(DATA_TYPE, 2) - c0 = vload2(0, (__global DATA_TYPE *)input.ptr); - VEC_DATA_TYPE(DATA_TYPE, 2) - c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3 * Nx, 0)); - - // Compute phi - DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const; - - // Multiply by twiddle factor - TWIDDLE_FACTOR_MULTIPLICATION(phi, c1); - TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2); - TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3); - - // Compute DFT N = 4 - DFT_4(c0, c1, c2, c3); - - // Store four complex output values - vstore2(c0, 0, (__global DATA_TYPE *)output.ptr); - vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0)); - vstore2(c2, 0, (__global DATA_TYPE 
*)tensor3D_offset(&output, 0, 2 * Nx, 0)); - vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3 * Nx, 0)); -} - -/** Computes a stage of a radix-5 FFT on axis 0. - * - * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time - * - * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/f32 - * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image - * @param[in] Nx The butterfly span. Products of radix order of previous radix's stage - * @param[in] Ni Nx * Ny. 
- * @param[in] exp_const Exponent constant - */ -__kernel void fft_radix_5_axis_0( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ - , - uint Nx, uint Ni, float exp_const) -{ - // Each work-item computes a single radix-5 - uint kx = get_global_id(0); - - // Compute nx - uint nx = kx % Nx; - - // Compute n index - uint n = nx + (kx / Nx) * Ni; - - // Get tensor pointers - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input); - input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z; -#ifdef IN_PLACE - Tensor3D output = input; -#else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output); - output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z; -#endif /* IN_PLACE */ - - // Load five complex input values - VEC_DATA_TYPE(DATA_TYPE, 2) - c0 = vload2(0, (__global DATA_TYPE *)input.ptr); - VEC_DATA_TYPE(DATA_TYPE, 2) - c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 3 * Nx, 0, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4 * Nx, 0, 0)); - - // Compute phi - DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const; - - // Multiply by twiddle factor - TWIDDLE_FACTOR_MULTIPLICATION(phi, c1); - TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2); - TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3); - TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4); - - // Compute DFT N = 5 - DFT_5(c0, c1, c2, c3, c4); - - // Store five complex output values - vstore2(c0, 0, (__global DATA_TYPE *)output.ptr); - vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0)); - vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0)); - vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 3 * Nx, 0, 0)); - vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4 * Nx, 0, 0)); -} - -/** Computes a stage of a radix-5 FFT on axis 1. - * - * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time - * - * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/f32 - * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr (Optional) Pointer to the destination image. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image - * @param[in] Nx The butterfly span. Products of radix order of previous radix's stage - * @param[in] Ni Nx * Ny. - * @param[in] exp_const Exponent constant - */ -__kernel void fft_radix_5_axis_1( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ - , - uint Nx, uint Ni, float exp_const) -{ - // Each work-item computes a single radix-5 - uint kx = get_global_id(1); - - // Compute nx - uint nx = kx % Nx; - - // Compute n index - uint n = nx + (kx / Nx) * Ni; - - // Get tensor pointers - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input); - input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z; -#ifdef IN_PLACE - Tensor3D output = input; -#else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output); - output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z; -#endif /* IN_PLACE */ - - // Load five complex input values - VEC_DATA_TYPE(DATA_TYPE, 2) - c0 = vload2(0, (__global DATA_TYPE *)input.ptr); - VEC_DATA_TYPE(DATA_TYPE, 2) - c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3 * Nx, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4 * Nx, 0)); - - // Compute phi - DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const; - - // Multiply by twiddle factor - TWIDDLE_FACTOR_MULTIPLICATION(phi, c1); - TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2); - TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3); - TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4); - - // Compute DFT N = 5 - DFT_5(c0, c1, c2, c3, c4); - - // Store five complex output values - vstore2(c0, 0, (__global DATA_TYPE *)output.ptr); - vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0)); - vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0)); - vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3 * Nx, 0)); - vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4 * Nx, 0)); -} - -/** Computes a stage of a radix-7 FFT on axis 0. - * - * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time - * - * @param[in,out] input_ptr Pointer to the source tensor. 
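Every stage kernel rotates its j-th input by j * phi before the small DFT. TWIDDLE_FACTOR_MULTIPLICATION is defined near the top of fft.cl and is not visible in this hunk; under the usual convention it is an in-place complex multiply by e^(i*phi), which in scalar C looks like:

    #include <math.h>

    /* In-place multiplication of (re, im) by e^(i*phi) = cos(phi) + i*sin(phi).
       A plausible scalar equivalent of TWIDDLE_FACTOR_MULTIPLICATION; the real
       macro operates on a 2-wide OpenCL vector instead. */
    static void twiddle(float phi, float *re, float *im)
    {
        const float wr = cosf(phi);
        const float wi = sinf(phi);
        const float r  = wr * *re - wi * *im; /* real part of the product      */
        const float i  = wr * *im + wi * *re; /* imaginary part of the product */
        *re = r;
        *im = i;
    }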
Supported data types: F16/f32 - * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image - * @param[in] Nx The butterfly span. Products of radix order of previous radix's stage - * @param[in] Ni Nx * Ny. - * @param[in] exp_const Exponent constant - */ -__kernel void fft_radix_7_axis_0( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ - , - uint Nx, uint Ni, float exp_const) -{ - // Each work-item computes a single radix-7 - uint kx = get_global_id(0); - - // Compute nx - uint nx = kx % Nx; - - // Compute n index - uint n = nx + (kx / Nx) * Ni; - - // Get tensor pointers - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input); - input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z; -#ifdef IN_PLACE - Tensor3D output = input; -#else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output); - output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z; -#endif /* IN_PLACE */ - - // Load seven complex input values - VEC_DATA_TYPE(DATA_TYPE, 2) - c0 = vload2(0, (__global DATA_TYPE *)input.ptr); - VEC_DATA_TYPE(DATA_TYPE, 2) - c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 3 * Nx, 0, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4 * Nx, 0, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 5 * Nx, 0, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 6 * Nx, 0, 0)); - - // Compute phi - DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const; - - // Multiply by twiddle factor - TWIDDLE_FACTOR_MULTIPLICATION(phi, c1); - TWIDDLE_FACTOR_MULTIPLICATION(2 * 
phi, c2); - TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3); - TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4); - TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5); - TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6); - - // Compute DFT N = 7 - DFT_7(c0, c1, c2, c3, c4, c5, c6); - - // Store seven complex output values - vstore2(c0, 0, (__global DATA_TYPE *)output.ptr); - vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0)); - vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0)); - vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 3 * Nx, 0, 0)); - vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4 * Nx, 0, 0)); - vstore2(c5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 5 * Nx, 0, 0)); - vstore2(c6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 6 * Nx, 0, 0)); -} - -/** Computes a stage of a radix-7 FFT on axis 1. - * - * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time - * - * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/f32 - * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image - * @param[in] Nx The butterfly span. Products of radix order of previous radix's stage - * @param[in] Ni Nx * Ny. 
- * @param[in] exp_const Exponent constant - */ -__kernel void fft_radix_7_axis_1( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ - , - uint Nx, uint Ni, float exp_const) -{ - // Each work-item computes a single radix-7 - uint kx = get_global_id(1); - - // Compute nx - uint nx = kx % Nx; - - // Compute n index - uint n = nx + (kx / Nx) * Ni; - - // Get tensor pointers - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input); - input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z; -#ifdef IN_PLACE - Tensor3D output = input; -#else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output); - output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z; -#endif /* IN_PLACE */ - - // Load seven complex input values - VEC_DATA_TYPE(DATA_TYPE, 2) - c0 = vload2(0, (__global DATA_TYPE *)input.ptr); - VEC_DATA_TYPE(DATA_TYPE, 2) - c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3 * Nx, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4 * Nx, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5 * Nx, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6 * Nx, 0)); - - // Compute phi - DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const; - - // Multiply by twiddle factor - TWIDDLE_FACTOR_MULTIPLICATION(phi, c1); - TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2); - TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3); - TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4); - TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5); - TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6); - - // Compute DFT N = 7 - DFT_7(c0, c1, c2, c3, c4, c5, c6); - - // Store seven complex output values - vstore2(c0, 0, (__global DATA_TYPE *)output.ptr); - vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0)); - vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0)); - vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3 * Nx, 0)); - vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4 * Nx, 0)); - vstore2(c5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 5 * Nx, 0)); - vstore2(c6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 6 * Nx, 0)); -} - -/** Computes a stage of a radix-8 FFT on axis 0. - * - * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time - * - * @param[in,out] input_ptr Pointer to the source tensor. 
Supported data types: F16/f32 - * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z (Optional) Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image - * @param[in] Nx The butterfly span. Products of radix order of previous radix's stage - * @param[in] Ni Nx * Ny. - * @param[in] exp_const Exponent constant - */ -__kernel void fft_radix_8_axis_0( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ - , - uint Nx, uint Ni, float exp_const) -{ - // Each work-item computes a single radix-8 - uint kx = get_global_id(0); - - // Compute nx - uint nx = kx % Nx; - - // Compute n index - uint n = nx + (kx / Nx) * Ni; - - // Get tensor pointers - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input); - input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z; -#ifdef IN_PLACE - Tensor3D output = input; -#else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output); - output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z; -#endif /* IN_PLACE */ - - // Load eight complex input values - VEC_DATA_TYPE(DATA_TYPE, 2) - c0 = vload2(0, (__global DATA_TYPE *)input.ptr); - VEC_DATA_TYPE(DATA_TYPE, 2) - c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, Nx, 0, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 2 * Nx, 0, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 3 * Nx, 0, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 4 * Nx, 0, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 5 * Nx, 0, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 6 * Nx, 0, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c7 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 7 * Nx, 0, 0)); - - // Compute phi - DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const; - - 
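// Assumption (exp_const is set by host code outside this hunk): input j of
// the radix-8 butterfly below is rotated by j * phi, i.e. multiplied by
// e^(i * j * phi); with phi = nx * exp_const this matches the Cooley-Tukey
// twiddle W^(j * nx) once exp_const = +/- 2 * pi / (8 * Nx).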
// Multiply by twiddle factor - TWIDDLE_FACTOR_MULTIPLICATION(phi, c1); - TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2); - TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3); - TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4); - TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5); - TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6); - TWIDDLE_FACTOR_MULTIPLICATION(7 * phi, c7); - - // Compute DFT N = 8 - DFT_8(c0, c1, c2, c3, c4, c5, c6, c7); - - // Store eight complex output values - vstore2(c0, 0, (__global DATA_TYPE *)output.ptr); - vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, Nx, 0, 0)); - vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 2 * Nx, 0, 0)); - vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 3 * Nx, 0, 0)); - vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 4 * Nx, 0, 0)); - vstore2(c5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 5 * Nx, 0, 0)); - vstore2(c6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 6 * Nx, 0, 0)); - vstore2(c7, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 7 * Nx, 0, 0)); -} - -/** Computes a stage of a radix-8 FFT on axis 1. - * - * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time - * - * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr (Optional) Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z (Optional) Stride of the destination image in Z dimension (in bytes) - * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination image - * @param[in] Nx The butterfly span. Product of the radix orders of the previous radix stages - * @param[in] Ni Nx * Ny.
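[Editor's note] TWIDDLE_FACTOR_MULTIPLICATION and DFT_8 above are helper macros defined earlier in fft.cl and are not part of this hunk. As a minimal sketch of the complex rotation the twiddle step performs, assuming DATA_TYPE=float (an illustration only, not the library's macro):

    // Sketch: multiply the complex value c = (re, im) by the twiddle factor
    // w = (cos(phi), sin(phi)). The kernel passes k * phi for the k-th input,
    // which is the standard decimation-in-time butterfly pre-rotation.
    float2 twiddle_mul_sketch(float phi, float2 c)
    {
        const float w_re = cos(phi);
        const float w_im = sin(phi);
        // (w_re + i*w_im) * (c.x + i*c.y)
        return (float2)(w_re * c.x - w_im * c.y, w_re * c.y + w_im * c.x);
    }

The sign of the rotation (forward vs inverse transform) is carried by exp_const, which the host is expected to derive from the FFT direction.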
- * @param[in] exp_const Exponent constant - */ -__kernel void fft_radix_8_axis_1( - TENSOR3D_DECLARATION(input) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(output) -#endif /* not IN_PLACE */ - , - uint Nx, uint Ni, float exp_const) -{ - // Each work-item computes a single radix-8 - uint kx = get_global_id(1); - - // Compute nx - uint nx = kx % Nx; - - // Compute n index - uint n = nx + (kx / Nx) * Ni; - - // Get tensor pointers - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input); - input.ptr += get_global_id(0) * input.stride_x + n * input.stride_y + get_global_id(2) * input.stride_z; -#ifdef IN_PLACE - Tensor3D output = input; -#else /* IN_PLACE */ - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output); - output.ptr += get_global_id(0) * output.stride_x + n * output.stride_y + get_global_id(2) * output.stride_z; -#endif /* IN_PLACE */ - - // Load eight complex input values - VEC_DATA_TYPE(DATA_TYPE, 2) - c0 = vload2(0, (__global DATA_TYPE *)input.ptr); - VEC_DATA_TYPE(DATA_TYPE, 2) - c1 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, Nx, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c2 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2 * Nx, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c3 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3 * Nx, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c4 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4 * Nx, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c5 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5 * Nx, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c6 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6 * Nx, 0)); - VEC_DATA_TYPE(DATA_TYPE, 2) - c7 = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 7 * Nx, 0)); - - // Compute phi - DATA_TYPE phi = (DATA_TYPE)nx * (DATA_TYPE)exp_const; - - // Multiply by twiddle factor - TWIDDLE_FACTOR_MULTIPLICATION(phi, c1); - TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2); - TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3); - TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4); - TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5); - TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6); - TWIDDLE_FACTOR_MULTIPLICATION(7 * phi, c7); - - // Compute DFT N = 8 - DFT_8(c0, c1, c2, c3, c4, c5, c6, c7); - - // Store eight complex output values - vstore2(c0, 0, (__global DATA_TYPE *)output.ptr); - vstore2(c1, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, Nx, 0)); - vstore2(c2, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 2 * Nx, 0)); - vstore2(c3, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 3 * Nx, 0)); - vstore2(c4, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 4 * Nx, 0)); - vstore2(c5, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 5 * Nx, 0)); - vstore2(c6, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 6 * Nx, 0)); - vstore2(c7, 0, (__global DATA_TYPE *)tensor3D_offset(&output, 0, 7 * Nx, 0)); -} -#endif // defined(DATA_TYPE) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/fft_digit_reverse.cl b/src/core/CL/cl_kernels/fft_digit_reverse.cl deleted file mode 100644 index de566212c6..0000000000 --- a/src/core/CL/cl_kernels/fft_digit_reverse.cl +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(VEC_SIZE) && defined(DATA_TYPE) -/** Computes the digit reverse stage on axis X - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] idx_ptr Pointer to the index tensor.
Supported data types: U32 - * @param[in] idx_stride_x Stride of the index tensor in X dimension (in bytes) - * @param[in] idx_step_x idx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] idx_offset_first_element_in_bytes The offset of the first element in the index tensor - */ -__kernel void fft_digit_reverse_axis_0( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - VECTOR_DECLARATION(idx)) -{ - // Get tensor pointers - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - Vector idx = CONVERT_TO_VECTOR_STRUCT(idx); - - const unsigned int iidx = *((__global uint *)(idx.ptr)); - - // Load data -#if VEC_SIZE == 1 - DATA_TYPE data = *((__global DATA_TYPE *)tensor3D_offset(&src, iidx, get_global_id(1), get_global_id(2))); -#elif VEC_SIZE == 2 - VEC_DATA_TYPE(DATA_TYPE, 2) - data = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&src, iidx, get_global_id(1), get_global_id(2))); -#else // VEC_SIZE == 1 -#error "vec_size of 1 and 2 are supported" -#endif // VEC_SIZE == 1 - - // Create result -#if VEC_SIZE == 1 - VEC_DATA_TYPE(DATA_TYPE, 2) - res = { data, 0 }; -#elif VEC_SIZE == 2 - VEC_DATA_TYPE(DATA_TYPE, 2) - res = data; -#else // VEC_SIZE == 1 -#error "vec_size of 1 and 2 are supported" -#endif // VEC_SIZE == 1 - - // Store result -#if defined(CONJ) - vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(res.s0, -res.s1), 0, (__global DATA_TYPE *)dst.ptr); -#else // defined(CONJ) - vstore2(res, 0, (__global DATA_TYPE *)dst.ptr); -#endif // defined(CONJ) -} - -/** Computes the digit reverse stage on axis Y - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] idx_ptr Pointer to the index tensor.
Supported data types: U32 - * @param[in] idx_stride_x Stride of the index tensor in X dimension (in bytes) - * @param[in] idx_step_x idx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] idx_offset_first_element_in_bytes The offset of the first element in the index tensor - */ -__kernel void fft_digit_reverse_axis_1( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - VECTOR_DECLARATION(idx)) -{ - // Get tensor pointers - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - Vector idx = CONVERT_TO_VECTOR_STRUCT_NO_STEP(idx); - - const unsigned int iidx = *((__global uint *)vector_offset(&idx, (int)(get_global_id(1)))); - - // Load data -#if VEC_SIZE == 1 - DATA_TYPE data = *((__global DATA_TYPE *)tensor3D_offset(&src, get_global_id(0), iidx, get_global_id(2))); -#elif VEC_SIZE == 2 - VEC_DATA_TYPE(DATA_TYPE, 2) - data = vload2(0, (__global DATA_TYPE *)tensor3D_offset(&src, get_global_id(0), iidx, get_global_id(2))); -#else // VEC_SIZE == 1 -#error "vec_size of 1 and 2 are supported" -#endif // VEC_SIZE == 1 - - // Create result -#if VEC_SIZE == 1 - VEC_DATA_TYPE(DATA_TYPE, 2) - res = { data, 0 }; -#elif VEC_SIZE == 2 - VEC_DATA_TYPE(DATA_TYPE, 2) - res = data; -#else // VEC_SIZE == 1 -#error "vec_size of 1 and 2 are supported" -#endif // VEC_SIZE == 1 - - // Store result -#if defined(CONJ) - vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(res.s0, -res.s1), 0, (__global DATA_TYPE *)dst.ptr); -#else // defined(CONJ) - vstore2(res, 0, (__global DATA_TYPE *)dst.ptr); -#endif // defined(CONJ) -} -#endif // defined(VEC_SIZE) && defined(DATA_TYPE) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/fft_scale.cl b/src/core/CL/cl_kernels/fft_scale.cl deleted file mode 100644 index 57e25ef504..0000000000 --- a/src/core/CL/cl_kernels/fft_scale.cl +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(VEC_SIZE) && defined(DATA_TYPE) -/** Computes the fft scale stage - * - * @param[in] src_ptr Pointer to the source tensor. 
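[Editor's note] The idx vector consumed by both digit-reverse kernels above is precomputed on the host side. As a rough illustration of what such an index table contains, here is a sketch for the pure radix-2 case (hypothetical helper; the library also supports mixed radices, where this becomes a digit rather than a bit reversal):

    // Hypothetical helper: reverse the log2_n low bits of i, so that
    // idx[i] gives the input position whose value belongs at output i.
    unsigned int bit_reverse_index_sketch(unsigned int i, unsigned int log2_n)
    {
        unsigned int r = 0;
        for(unsigned int b = 0; b < log2_n; ++b)
        {
            r = (r << 1) | ((i >> b) & 1u); // peel the lowest bit of i into r
        }
        return r;
    }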
Supported data types: F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x (Optional) dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y (Optional) dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z (Optional) dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor - * @param[in] scale Scale to apply to the complex value - */ -__kernel void fft_scale_conj( - TENSOR3D_DECLARATION(src) -#ifndef IN_PLACE - , - TENSOR3D_DECLARATION(dst) -#endif /* not IN_PLACE */ - , - float scale) -{ - // Get tensor pointers - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); -#if defined(IN_PLACE) - Tensor3D dst = src; -#else /* IN_PLACE */ - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); -#endif /* IN_PLACE */ - - // Store result -#if VEC_SIZE == 1 - *((__global DATA_TYPE *)dst.ptr) = (*(__global DATA_TYPE *)src.ptr) / (DATA_TYPE)scale; -#elif VEC_SIZE == 2 - // Load data - VEC_DATA_TYPE(DATA_TYPE, 2) - data = vload2(0, (__global DATA_TYPE *)src.ptr); - data /= (DATA_TYPE)scale; -#if defined(CONJ) - vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(data.s0, -data.s1), 0, (__global DATA_TYPE *)dst.ptr); -#else // defined(CONJ) - vstore2(data, 0, (__global DATA_TYPE *)dst.ptr); -#endif // defined(CONJ) -#else // VEC_SIZE == 1 -#error "vec_size of 1 and 2 are supported" -#endif // VEC_SIZE == 1 -} -#endif // defined(VEC_SIZE) && defined(DATA_TYPE) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/fill_border.cl b/src/core/CL/cl_kernels/fill_border.cl deleted file mode 100644 index 5775d899e8..0000000000 --- a/src/core/CL/cl_kernels/fill_border.cl +++ /dev/null @@ -1,165 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software.
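[Editor's note] fft_scale_conj, shown at the top of this hunk, divides each complex element by scale and optionally conjugates it, which is the usual way an inverse FFT is finalised (scale is typically the transform length N). A sketch of the per-element math for the VEC_SIZE == 2 path (illustration only):

    // One (re, im) pair per work-item: uniform 1/scale scaling plus an
    // optional conjugation, selected here by a runtime flag for clarity
    // (the kernel selects it at compile time via -DCONJ).
    float2 fft_scale_conj_sketch(float2 v, float scale, int conj)
    {
        v /= scale;
        return conj ? (float2)(v.x, -v.y) : v;
    }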
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** Fill N pixels of the padding edge of a single channel image by replicating the closest valid pixel. - * - * @attention The DATA_TYPE needs to be passed at compile time. - * e.g. -DDATA_TYPE=int - * - * @attention The border size for top, bottom, left, right needs to be passed at compile time. - * e.g. -DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2 - * - * @param[in,out] buf_ptr Pointer to the source image. Supported data types: All - * @param[in] buf_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] buf_stride_z Stride between images if batching images (in bytes) - * @param[in] buf_step_z buf_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] width Width of the valid region of the image - * @param[in] height Height of the valid region of the image - * @param[in] start_pos XY coordinate indicating the start point of the valid region - */ -__kernel void fill_image_borders_replicate( - TENSOR3D_DECLARATION(buf), - uint width, - uint height, - int2 start_pos) -{ - Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(buf); - - // Update pointer to point to the starting point of the valid region - buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x; - - const int total_width = BORDER_SIZE_LEFT + width + BORDER_SIZE_RIGHT; - const int gid0 = get_global_id(0); - const int gidH = gid0 - total_width; - const int gidW = gid0 - BORDER_SIZE_LEFT; - - if(gidH >= 0) - { - // Handle left border - DATA_TYPE left_val = *(__global DATA_TYPE *)offset(&buf, 0, gidH); - for(int i = -BORDER_SIZE_LEFT; i < 0; ++i) - { - *(__global DATA_TYPE *)offset(&buf, i, gidH) = left_val; - } - // Handle right border - DATA_TYPE right_val = *(__global DATA_TYPE *)offset(&buf, width - 1, gidH); - for(int i = 0; i < BORDER_SIZE_RIGHT; ++i) - { - *(__global DATA_TYPE *)offset(&buf, width + i, gidH) = right_val; - } - } - else - { - // Get value for corners - int val_idx = gidW; - if(gidW < 0 || gidW > (width - 1)) - { - val_idx = gidW < 0 ? 0 : width - 1; - } - - // Handle top border - DATA_TYPE top_val = *(__global DATA_TYPE *)offset(&buf, val_idx, 0); - for(int i = -BORDER_SIZE_TOP; i < 0; ++i) - { - *(__global DATA_TYPE *)offset(&buf, gidW, i) = top_val; - } - // Handle bottom border - DATA_TYPE bottom_val = *(__global DATA_TYPE *)offset(&buf, val_idx, height - 1); - for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i) - { - *(__global DATA_TYPE *)offset(&buf, gidW, height + i) = bottom_val; - } - } -} - -/** Fill N pixels of the padding edge of a single channel image with a constant value.
- * - * @attention The DATA_TYPE needs to be passed at compile time. - * e.g. -DDATA_TYPE=int - * - * @attention The border size for top, bottom, left, right needs to be passed at compile time. - * e.g. -DBORDER_SIZE_TOP=0 -DBORDER_SIZE_BOTTOM=2 -DBORDER_SIZE_LEFT=0 -DBORDER_SIZE_RIGHT=2 - * - * @param[out] buf_ptr Pointer to the source image. Supported data types: All - * @param[in] buf_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] buf_step_x buf_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] buf_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] buf_step_y buf_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] buf_stride_z Stride between images if batching images (in bytes) - * @param[in] buf_step_z buf_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] buf_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] width Width of the valid region of the image - * @param[in] height Height of the valid region of the image - * @param[in] start_pos XY coordinate indicating the start point of the valid region - * @param[in] constant_value Constant value to use to fill the edges - */ -__kernel void fill_image_borders_constant( - TENSOR3D_DECLARATION(buf), - uint width, - uint height, - int2 start_pos, - DATA_TYPE constant_value) -{ - Image buf = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(buf); - - // Update pointer to point to the starting point of the valid region - buf.ptr += start_pos.y * buf.stride_y + start_pos.x * buf.stride_x; - - const int total_width = BORDER_SIZE_LEFT + width + BORDER_SIZE_RIGHT; - const int gid0 = get_global_id(0); - const int gidH = gid0 - total_width; - const int gidW = gid0 - BORDER_SIZE_LEFT; - - if(gidH >= 0) - { - // Handle left border - for(int i = -BORDER_SIZE_LEFT; i < 0; ++i) - { - *(__global DATA_TYPE *)offset(&buf, i, gidH) = constant_value; - } - // Handle right border - for(int i = 0; i < BORDER_SIZE_RIGHT; ++i) - { - *(__global DATA_TYPE *)offset(&buf, width + i, gidH) = constant_value; - } - } - else - { - // Handle top border - for(int i = -BORDER_SIZE_TOP; i < 0; ++i) - { - *(__global DATA_TYPE *)offset(&buf, gidW, i) = constant_value; - } - // Handle bottom border - for(int i = 0; i < BORDER_SIZE_BOTTOM; ++i) - { - *(__global DATA_TYPE *)offset(&buf, gidW, height + i) = constant_value; - } - } -} diff --git a/src/core/CL/cl_kernels/floor.cl b/src/core/CL/cl_kernels/floor.cl deleted file mode 100644 index f6dd4edd2e..0000000000 --- a/src/core/CL/cl_kernels/floor.cl +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software.
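[Editor's note] Both fill_image_borders_* kernels above fold two kinds of workers into one 1D dimension: work-items with gid0 >= BORDER_SIZE_LEFT + width + BORDER_SIZE_RIGHT fill the left/right borders of one row each, while the remaining work-items fill the top/bottom borders of one (possibly padded) column each. A sketch of the global work size this implies on the host (my reconstruction, not the actual host code):

    // One work-item per padded column plus one per valid row:
    // gid0 in [0, total_width)                     -> top/bottom of a column
    // gid0 in [total_width, total_width + height)  -> left/right of a row
    size_t fill_border_gws_x_sketch(size_t width, size_t height,
                                    size_t border_left, size_t border_right)
    {
        const size_t total_width = border_left + width + border_right;
        return total_width + height;
    }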
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) - -/** Perform a floor operation on an input tensor. - * - * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note The leftover size in the X dimension should be given as a preprocessor argument using -DVEC_SIZE_LEFTOVER, i.e. x_dimension % VEC_SIZE. e.g. -DVEC_SIZE_LEFTOVER=1 - * @note Can only take floating point data types. - * - * @param[in] input_ptr Pointer to the source image. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image.
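[Editor's note] The VEC_SIZE_LEFTOVER scheme documented in the notes above avoids a scalar tail loop: when the tensor width is not a multiple of VEC_SIZE, every work-item after the first is shifted back by VEC_SIZE - VEC_SIZE_LEFTOVER elements, so the final vector access stays in bounds at the cost of rewriting a few overlapping elements. A worked sketch of the offset formula used by the floor_layer kernel below (illustration only):

    // x_offs = max(gid0 * VEC_SIZE - (VEC_SIZE - LEFTOVER) % VEC_SIZE, 0)
    // e.g. width = 10, VEC_SIZE = 4 => LEFTOVER = 2, shift = 2:
    // gid0 = 0 -> 0..3, gid0 = 1 -> 2..5, gid0 = 2 -> 6..9 (all in bounds)
    int floor_x_offs_sketch(int gid0, int vec_size, int leftover)
    {
        const int shift = (vec_size - leftover) % vec_size;
        const int offs  = gid0 * vec_size - shift;
        return offs > 0 ? offs : 0;
    }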
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination image in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void floor_layer( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - // Offset computation - const uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); - - // Address computation - __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z; - __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z; - - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - data0 = floor(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr)); - - STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); -} -#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/gather.cl b/src/core/CL/cl_kernels/gather.cl deleted file mode 100644 index 41f439cb47..0000000000 --- a/src/core/CL/cl_kernels/gather.cl +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(AXIS) - -/** Performs the Gather operation along the chosen axis - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @note Axis should be given as a preprocessor argument using -DAXIS=axis. e.g. -DAXIS=1 - * @attention Output tensor depth should be given as a preprocessor argument using -DOUTPUT_DIM_Z=size. e.g.
-DOUTPUT_DIM_Z=16 - * @attention Input tensor depth should be given as a preprocessor argument using -DINPUT_DIM_Z=size. e.g. -DINPUT_DIM_Z=16 - * - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: All - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per work item (in bytes) - * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] input_step_w input_stride_w * number of elements along W processed per work item (in bytes) - * @param[in] input_offset_first_element_in_bytes Offset of the first element in the source tensor - * @param[in] indices_ptr Pointer to the indices vector. Supported data types: S32/U32. - * @param[in] indices_stride_x Stride of the indices vector in X dimension (in bytes) - * @param[in] indices_step_x indices_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] indices_offset_first_element_in_bytes Offset of the first element in the indices vector - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per work item (in bytes) - * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per work item (in bytes) - * @param[in] output_offset_first_element_in_bytes Offset of the first element in the destination tensor - */ -__kernel void gather( - TENSOR4D_DECLARATION(input), - VECTOR_DECLARATION(indices), - TENSOR4D_DECLARATION(output)) -{ - const int px = get_global_id(0); - const int py = get_global_id(1); - const int pz = get_global_id(2) % OUTPUT_DIM_Z; - const int pw = get_global_id(2) / OUTPUT_DIM_Z; - - const Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, INPUT_DIM_Z); - const Vector indices = CONVERT_TO_VECTOR_STRUCT_NO_STEP(indices); - Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, OUTPUT_DIM_Z); - -#if AXIS == 0 - const uint index = *(__global const uint *)vector_offset(&indices, px); - __global const uchar *input_addr = tensor4D_offset(&input, index, py, pz, pw); -#elif AXIS == 1 - const uint index = *(__global const uint *)vector_offset(&indices, py); - __global const uchar *input_addr = tensor4D_offset(&input, px, index, pz, pw); -#elif AXIS == 2 - const uint index = *(__global const uint *)vector_offset(&indices, pz); - __global const uchar *input_addr = tensor4D_offset(&input, px, py, index, pw); -#elif AXIS == 3 - const uint
index = *(__global const uint *)vector_offset(&indices, pw); - __global const uchar *input_addr = tensor4D_offset(&input, px, py, pz, index); -#endif //AXIS - - *(__global DATA_TYPE *)output.ptr = *((__global const DATA_TYPE *)input_addr); -} - -#endif //defined(DATA_TYPE) && defined(AXIS) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/gemm.cl deleted file mode 100644 index 10435d376f..0000000000 --- a/src/core/CL/cl_kernels/gemm.cl +++ /dev/null @@ -1,4386 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "gemm_helpers.h" -#include "repeat.h" - -#if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0) -#define INC2 (VEC_DATA_TYPE(uint, 2))(0, 1) -#define INC3 (VEC_DATA_TYPE(uint, 3))(0, 1, 2) -#define INC4 (VEC_DATA_TYPE(uint, 4))(0, 1, 2, 3) -#define INC8 (VEC_DATA_TYPE(uint, 8))(0, 1, 2, 3, 4, 5, 6, 7) -#define INC16 (VEC_DATA_TYPE(uint, 16))(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) -#define CONCAT_INC(K0) INC##K0 -#define INC(K0) CONCAT_INC(K0) - -#if(SRC_WIDTH % K0) -#define BOUNDARY_CONDITION_X(x, a) \ - ({ \ - a = select(0, a, CONVERT(((x * (VEC_DATA_TYPE(uint, K0))K0 + INC(K0)) < (VEC_DATA_TYPE(uint, K0))SRC_WIDTH), VEC_DATA_TYPE(DATA_TYPE, K0))); \ - }) -#else // (SRC_WIDTH % K0) -#define BOUNDARY_CONDITION_X(x, a) \ - ({}) -#endif // (SRC_WIDTH % K0) - -#define LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \ - ({ \ - if(y * M0 + M0 >= SRC_HEIGHT && PARTIAL_LOAD_M0 != 0) \ - { \ - if(x * K0 + K0 >= SRC_WIDTH && (PARTIAL_LOAD_K0 != 0)) \ - { \ - LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \ - } \ - else \ - { \ - LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \ - } \ - } \ - else \ - { \ - if(x * K0 + K0 >= SRC_WIDTH && (PARTIAL_LOAD_K0 != 0)) \ - { \ - LOAD_TENSOR_M0XN0(M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \ - } \ - else \ - { \ - LOAD_TENSOR_M0XN0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); \ - } \ - } \ - }) - -/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (not transposed) in - * the output matrix unrolling the values. 
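[Editor's note] LOAD_TENSOR_BOUNDARY_AWARE_M0XK0, defined just above, is what lets the two LHS reshape kernels below read edge tiles safely: it picks a smaller load shape whenever the current M0xK0 tile sticks out past SRC_HEIGHT or SRC_WIDTH. Its control flow reduces to the following sketch (my paraphrase of the macro, not a drop-in replacement):

    // Effective block shape for tile (y, x): the full m0 x k0 inside the
    // tensor, partial_m0/partial_k0 on the bottom/right edge tiles.
    void pick_block_shape_sketch(unsigned int y, unsigned int x,
                                 unsigned int m0, unsigned int k0,
                                 unsigned int src_h, unsigned int src_w,
                                 unsigned int partial_m0, unsigned int partial_k0,
                                 unsigned int *rows, unsigned int *cols)
    {
        *rows = (y * m0 + m0 >= src_h && partial_m0 != 0) ? partial_m0 : m0;
        *cols = (x * k0 + k0 >= src_w && partial_k0 != 0) ? partial_k0 : k0;
    }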
- * - * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) - * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16) - * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16) - * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2). - * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2) - * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1) - * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1) - * @note Only the following values for M0, K0 and V0 are supported: - * M0: 2,3,4,5,6,7,8 - * K0: 2,3,4,8,16 - * V0: greater than 0 - * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time: - * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D - * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor. - * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped - * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time. - * - * @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: All - * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D) - */ -__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst) -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D - ) -{ - // Block size -#define BLOCK_SIZE ((M0) * (K0)) - - // Output offset X -#if defined(INTERLEAVE) -#define OUTPUT_OFFSET_X (K0)
-#else // defined(INTERLEAVE) -#define OUTPUT_OFFSET_X (BLOCK_SIZE) -#endif // defined(INTERLEAVE) - - // Output step X -#if defined(INTERLEAVE) -#define OUTPUT_STEP_X (K0) * (V0) -#else // Do not interleave -#define OUTPUT_STEP_X (K0) -#endif // defined(INTERLEAVE) - - // Compute source and destination addresses - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - - // ------------------ Compute input/output addresses --------------------------- - - // Compute the input address - __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y; - - // Compute the output address - __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) * - (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)); - - // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0; - REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0); - -#if defined(REINTERPRET_INPUT_AS_3D) - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply src_stride_z by DEPTH_GEMM3D - - input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D; - - // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y); - -#else // defined(REINTERPRET_INPUT_AS_3D) - - input_ptr += z * (uint)src_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - output_ptr += z * (uint)dst_stride_z; - - // ---------------------------Load input values -------------------------------- - // Load values from the LHS matrix - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0); - - LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); - - // ---------------------------Store output values ------------------------------ - REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0); - STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout); - -#undef BLOCK_SIZE -#undef OUTPUT_OFFSET_X -#undef OUTPUT_STEP_X -} - -#if M0 == 2 -#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \ - ({ \ - VEC_DATA_TYPE(DATA_TYPE, M0) \ - res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i); \ - VSTORE(M0) \ - (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \ - }) -#elif M0 == 3 // M0 == 3 -#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \ - ({ \ - VEC_DATA_TYPE(DATA_TYPE, M0) \ - res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i); \ - VSTORE(M0) \ - (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \ - }) -#elif M0 == 4 // M0 == 4 -#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \ - ({ \ - VEC_DATA_TYPE(DATA_TYPE, M0) \ - res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \ - VSTORE(M0) \ - (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \ - }) -#elif M0 == 5 // M0 == 5 -#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \ - ({ \ - VEC_DATA_TYPE(DATA_TYPE, 4) \ - res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \ - DATA_TYPE res1 = a4.s##i; \ - VSTORE(4) \ - (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \ - *((__global DATA_TYPE 
*)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4) = res1; \ - }) -#elif M0 == 6 // M0 == 6 -#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \ - ({ \ - VEC_DATA_TYPE(DATA_TYPE, 4) \ - res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \ - VEC_DATA_TYPE(DATA_TYPE, 2) \ - res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i); \ - VSTORE(4) \ - (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \ - VSTORE(2) \ - (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \ - }) -#elif M0 == 7 // M0 == 7 -#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \ - ({ \ - VEC_DATA_TYPE(DATA_TYPE, 4) \ - res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \ - VEC_DATA_TYPE(DATA_TYPE, 3) \ - res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i); \ - VSTORE(4) \ - (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \ - VSTORE(3) \ - (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \ - }) -#elif M0 == 8 // M0 == 8 -#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \ - ({ \ - VEC_DATA_TYPE(DATA_TYPE, M0) \ - res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, a6.s##i, a7.s##i); \ - VSTORE(M0) \ - (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \ - }) -#else // M0 not supported -#error "M0 value not supported" -#endif // M0 conditions - -/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (transposed) in - * the output matrix unrolling the values. - * - * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) - * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16) - * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16) - * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2). - * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2) - * @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1) - * @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1) - * @note Only the following values for M0, K0 and V0 are supported: - * M0: 2,3,4,5,6,7,8 - * K0: 2,3,4,8,16 - * V0: greater than 0 - * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time: - * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D - * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor. - * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped - * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time. - * - * @param[in] src_ptr Pointer to the source LHS tensor.
Supported data types: All - * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D) - */ -__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst) -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D - ) -{ - // Block size -#define BLOCK_SIZE ((M0) * (K0)) - - // Output offset X -#if defined(INTERLEAVE) -#define OUTPUT_OFFSET_X (M0) -#else // defined(INTERLEAVE) -#define OUTPUT_OFFSET_X (BLOCK_SIZE) -#endif // defined(INTERLEAVE) - - // Output step X -#if defined(INTERLEAVE) -#define OUTPUT_STEP_X (M0) * (V0) -#else // Do not interleave -#define OUTPUT_STEP_X (M0) -#endif // defined(INTERLEAVE) - - // Compute source and destination addresses - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - - // ------------------ Compute input/output addresses --------------------------- - - // Compute the input address - __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y; - - // Compute the output address - __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % V0) * - (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)); - - // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0; - REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0); - -#if defined(REINTERPRET_INPUT_AS_3D) - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply src_stride_z by DEPTH_GEMM3D - - input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D; - - // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y); - -#else // defined(REINTERPRET_INPUT_AS_3D) - - input_ptr += z * (uint)src_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - output_ptr += z * (uint)dst_stride_z; - - // ---------------------------Load input values -------------------------------- - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0); - - LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin); - - // ---------------------------Transpose and store block ----------------------- - - TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0); - TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1); -#if K0 > 2 - TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2); -#endif // K0 > 2 -#if K0 > 3 - TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3); -#endif // K0 > 3 -#if K0 > 4 - TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4); - TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5); - TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6); - TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7); -#endif // K0 > 4 -#if K0 > 8 - TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8); - TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9); - TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A); - TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B); - TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C); - TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D); - TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E); - TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F); -#endif // K0 > 8 - -#undef BLOCK_SIZE -#undef OUTPUT_OFFSET_X -#undef OUTPUT_STEP_X -} -#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0) - -#if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT) -/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (not transposed) in - * the output matrix unrolling the values. - * - * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) - * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16) - * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2). - * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2) - * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time. - * @note Only the following values for K0, N0 and H0 are supported: - * N0: 2,3,4,8,16 - * K0: 1,2,3,4,8,16 - * H0: greater than 0 - * - * @param[in] src_ptr Pointer to the source RHS tensor.
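[Editor's note] The RHS reshape kernels documented from here on use the same destination layout as the LHS kernels above: each block is unrolled onto a single destination row, and with -DINTERLEAVE the V0 (LHS) or H0 (RHS) blocks sharing a row are woven together, with OUTPUT_OFFSET_X locating a block's first element and OUTPUT_STEP_X separating consecutive sub-rows of one block. A sketch of the interleaved offset arithmetic for the non-transposed LHS case (illustration only; the helper name is mine):

    // Element offset, within one destination row, of the start of sub-row m
    // of block v under the -DINTERLEAVE layout of gemm_reshape_lhs_matrix_nt:
    // blocks start k0 elements apart, sub-rows step k0 * v0 elements.
    unsigned int lhs_nt_interleaved_offset_sketch(unsigned int v, unsigned int m,
                                                  unsigned int k0, unsigned int v0)
    {
        return v * k0 + m * k0 * v0;
    }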
Supported data types: All - * @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - */ -__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - // Block size -#define BLOCK_SIZE ((K0) * (N0)) - - // Output offset X -#if defined(INTERLEAVE) -#define OUTPUT_OFFSET_X (N0) -#else // defined(INTERLEAVE) -#define OUTPUT_OFFSET_X (BLOCK_SIZE) -#endif // defined(INTERLEAVE) - - // Output step X -#if defined(INTERLEAVE) -#define OUTPUT_STEP_X (N0) * (H0) -#else // Do not interleave -#define OUTPUT_STEP_X (N0) -#endif // defined(INTERLEAVE) - - // Compute source and destination addresses - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - - // ------------------ Compute input/output addresses --------------------------- - - // Compute the input address - __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z; - - // Compute the output address - __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % (uint)H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + (( - x / (uint)H0) - * (uint)dst_stride_y) - + z * (uint)dst_stride_z; - - // ---------------------------Load input values -------------------------------- - - REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); ////uint a0=0, a1=0, a2=0...a(M0-1)=0; - - // Load values from the RHS matrix - a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y)); -#if K0 > 1 - if(y * (uint)K0 + 1 < SRC_HEIGHT) - { - a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y)); - } -#endif // K0 > 1 -#if K0 > 2 - if(y * (uint)K0 + 2 < SRC_HEIGHT) - { - a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y)); - } -#endif // K0 > 2 -#if K0 > 3 - if(y * (uint)K0 + 3 < SRC_HEIGHT) - { - a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y)); - } -#endif // K0 > 3 -#if K0 > 4 - if(y * (uint)K0 + 4 < SRC_HEIGHT) - { - a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * 
src_stride_y)); - } - if(y * (uint)K0 + 5 < SRC_HEIGHT) - { - a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y)); - } - if(y * (uint)K0 + 6 < SRC_HEIGHT) - { - a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y)); - } - if(y * (uint)K0 + 7 < SRC_HEIGHT) - { - a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y)); - } -#endif // K0 > 4 -#if K0 > 8 - if(y * (uint)K0 + 8 < SRC_HEIGHT) - { - a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y)); - } - if(y * (uint)K0 + 9 < SRC_HEIGHT) - { - a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y)); - } - if(y * (uint)K0 + 10 < SRC_HEIGHT) - { - aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y)); - } - if(y * (uint)K0 + 11 < SRC_HEIGHT) - { - aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y)); - } - if(y * (uint)K0 + 12 < SRC_HEIGHT) - { - aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y)); - } - if(y * (uint)K0 + 13 < SRC_HEIGHT) - { - aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y)); - } - if(y * (uint)K0 + 14 < SRC_HEIGHT) - { - aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y)); - } - if(y * (uint)K0 + 15 < SRC_HEIGHT) - { - aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y)); - } -#endif // K0 > 8 - - // ---------------------------Store output values ------------------------------ - REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0); - STORE_BLOCK(K0, N0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout); - -#undef BLOCK_SIZE -#undef OUTPUT_OFFSET_X -#undef OUTPUT_STEP_X -} - -#if defined(TRANSPOSE) -/** This OpenCL kernel reshapes the RHS input matrix. The kernel splits the input matrix into blocks of size K0xN0 and stores each one (transposed) in - * the output matrix unrolling the values. - * - * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) - * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16) - * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2). - * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2) - * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time. - * @note The option -DTRANSPOSE must be passed at compile time. - * @note Only the following values for K0, N0 and H0 are supported: - * N0: 2,3,4,8,16 - * K0: 2,3,4,8,16 - * H0: greater than 0 - * - * @param[in] src_ptr Pointer to the source RHS tensor.
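 - * A worked example of the transposed layout (a sketch, assuming K0=2, N0=2, H0=2 and the same 4x4 source used for the non-transposed kernel above):
 - * Each K0xN0 block is written column by column, i.e. transposed, so B00 = [1 2; 5 6] is stored as 1 5 2 6.
 - * Without -DINTERLEAVE the first output row is: 1 5 2 6 3 7 4 8
 - * With -DINTERLEAVE it becomes:                 1 5 3 7 2 6 4 8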
Supported data types: All - * @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - */ -__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - // Block size -#define BLOCK_SIZE ((K0) * (N0)) - - // Output offset X -#if defined(INTERLEAVE) -#define OUTPUT_OFFSET_X (K0) -#else // defined(INTERLEAVE) -#define OUTPUT_OFFSET_X (BLOCK_SIZE) -#endif // defined(INTERLEAVE) - - // Output step X -#if defined(INTERLEAVE) -#define OUTPUT_STEP_X (K0) * (H0) -#else // Do not interleave -#define OUTPUT_STEP_X (K0) -#endif // defined(INTERLEAVE) - - // Compute source and destination addresses - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - - // ------------------ Compute input/output addresses --------------------------- - - // Compute the input address - __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z; - - // Compute the output address - __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((x / - (uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z; - - // ---------------------------Load input values -------------------------------- - REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... 
a(K0-1)=0; - - // Load values from the RHS matrix - a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y)); - if(y * (uint)K0 + 1 < SRC_HEIGHT) - { - a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y)); - } -#if K0 > 2 - if(y * (uint)K0 + 2 < SRC_HEIGHT) - { - a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y)); - } -#endif // K0 > 2 -#if K0 > 3 - if(y * (uint)K0 + 3 < SRC_HEIGHT) - { - a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y)); - } -#endif // K0 > 3 -#if K0 > 4 - if(y * (uint)K0 + 4 < SRC_HEIGHT) - { - a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y)); - } - if(y * (uint)K0 + 5 < SRC_HEIGHT) - { - a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y)); - } - if(y * (uint)K0 + 6 < SRC_HEIGHT) - { - a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y)); - } - if(y * (uint)K0 + 7 < SRC_HEIGHT) - { - a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y)); - } -#endif // K0 > 4 -#if K0 > 8 - if(y * (uint)K0 + 8 < SRC_HEIGHT) - { - a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y)); - } - if(y * (uint)K0 + 9 < SRC_HEIGHT) - { - a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y)); - } - if(y * (uint)K0 + 10 < SRC_HEIGHT) - { - aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y)); - } - if(y * (uint)K0 + 11 < SRC_HEIGHT) - { - aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y)); - } - if(y * (uint)K0 + 12 < SRC_HEIGHT) - { - aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y)); - } - if(y * (uint)K0 + 13 < SRC_HEIGHT) - { - aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y)); - } - if(y * (uint)K0 + 14 < SRC_HEIGHT) - { - aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y)); - } - if(y * (uint)K0 + 15 < SRC_HEIGHT) - { - aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y)); - } -#endif // K0 > 8 - - // ---------------------------Transpose the block ------------------------------ - REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), res, 0); //VEC_DATA_TYPE(DATA_TYPE, K0) res0=0, res1=0, res2=0,... 
res(N0-1)=0; - -#if K0 == 2 - // This part computes the following transpositions: - // 2x2 -> 2x2 - // 2x4 -> 4x2 - // 2x8 -> 8x2 - // 2x16 -> 16x2 - res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0); - res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1); -#if N0 > 2 - res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2); -#endif // N0 > 2 -#if N0 > 3 - res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3); -#endif // N0 > 3 -#if N0 > 4 - res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4); - res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5); - res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6); - res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7); -#endif // N0 > 4 -#if N0 > 8 - res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8); - res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9); - resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA); - resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB); - resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC); - resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD); - resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE); - resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF); -#endif // N0 > 8 - -#elif K0 == 3 // K0 == 2 - // This part computes the following transpositions: - // 3x2 -> 2x3 - // 3x4 -> 4x3 - // 3x8 -> 8x3 - // 3x16 -> 16x3 - res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0); - res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1); -#if N0 > 2 - res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2); -#endif // N0 > 2 -#if N0 > 3 - res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3); -#endif // N0 > 3 -#if N0 > 4 - res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4); - res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5); - res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6); - res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7); -#endif // N0 > 4 -#if N0 > 8 - res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8); - res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9); - resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA); - resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB); - resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC); - resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD); - resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE); - resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF); -#endif // N0 > 8 - -#elif K0 == 4 // K0 == 4 - // This part computes the following transpositions: - // 4x2 -> 2x4 - // 4x4 -> 4x4 - // 4x8 -> 8x4 - // 4x16 -> 16x4 - res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0); - res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1); -#if N0 > 2 - res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2); -#endif // N0 > 2 -#if N0 > 3 - res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3); -#endif // N0 > 3 -#if N0 > 4 - res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4); - res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5); - res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6); - res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7); -#endif // N0 > 4 -#if N0 > 8 - res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8); - res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9); - resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA); - resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB); - resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC); 
- resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD); - resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE); - resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF); -#endif // N0 > 8 - -#elif K0 == 8 // K0 == 8 - // This part computes the following transpositions: - // 8x2 -> 2x8 - // 8x4 -> 4x8 - // 8x8 -> 8x8 - // 8x16 -> 16x8 - res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0); - res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1); -#if N0 > 2 - res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2); -#endif // N0 > 2 -#if N0 > 3 - res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3); -#endif // N0 > 3 -#if N0 > 4 - res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4); - res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5); - res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6); - res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7); -#endif // N0 > 4 -#if N0 > 8 - res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8); - res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9); - resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA); - resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB); - resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC); - resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD); - resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE); - resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF); -#endif // N0 > 8 - -#elif K0 == 16 // K0 == 16 - - // This part computes the following transpositions: - // 16x2 -> 2x16 - // 16x4 -> 4x16 - // 16x8 -> 8x16 - // 16x16 -> 16x16 - res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0, - a8.s0, a9.s0, aA.s0, aB.s0, aC.s0, aD.s0, aE.s0, aF.s0); - res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1, - a8.s1, a9.s1, aA.s1, aB.s1, aC.s1, aD.s1, aE.s1, aF.s1); -#if N0 > 2 - res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2, - a8.s2, a9.s2, aA.s2, aB.s2, aC.s2, aD.s2, aE.s2, aF.s2); -#endif // N0 > 2 -#if N0 > 3 - res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3, - a8.s3, a9.s3, aA.s3, aB.s3, aC.s3, aD.s3, aE.s3, aF.s3); -#endif // N0 > 3 -#if N0 > 4 - res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4, - a8.s4, a9.s4, aA.s4, aB.s4, aC.s4, aD.s4, aE.s4, aF.s4); - res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5, - a8.s5, a9.s5, aA.s5, aB.s5, aC.s5, aD.s5, aE.s5, aF.s5); - res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6, - a8.s6, a9.s6, aA.s6, aB.s6, aC.s6, aD.s6, aE.s6, aF.s6); - res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7, - a8.s7, a9.s7, aA.s7, aB.s7, aC.s7, aD.s7, aE.s7, aF.s7); -#endif // N0 > 4 -#if N0 > 8 - res8 
= (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8, - a8.s8, a9.s8, aA.s8, aB.s8, aC.s8, aD.s8, aE.s8, aF.s8); - res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9, - a8.s9, a9.s9, aA.s9, aB.s9, aC.s9, aD.s9, aE.s9, aF.s9); - resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA, - a8.sA, a9.sA, aA.sA, aB.sA, aC.sA, aD.sA, aE.sA, aF.sA); - resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB, - a8.sB, a9.sB, aA.sB, aB.sB, aC.sB, aD.sB, aE.sB, aF.sB); - resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC, - a8.sC, a9.sC, aA.sC, aB.sC, aC.sC, aD.sC, aE.sC, aF.sC); - resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD, - a8.sD, a9.sD, aA.sD, aB.sD, aC.sD, aD.sD, aE.sD, aF.sD); - resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE, - a8.sE, a9.sE, aA.sE, aB.sE, aC.sE, aD.sE, aE.sE, aF.sE); - resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF, - a8.sF, a9.sF, aA.sF, aB.sF, aC.sF, aD.sF, aE.sF, aF.sF); -#endif // N0 > 8 - -#else // N0 == 16 -#error "Not supported N0 value" -#endif // N0 > 2 - - // ---------------------------Store the output values ------------------------------ - REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0); - STORE_BLOCK(N0, K0, DATA_TYPE, res, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout); - -#undef BLOCK_SIZE -#undef OUTPUT_OFFSET_X -#undef OUTPUT_STEP_X -} -#endif // defined(TRANSPOSE) -#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT) - -#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K) - -#define CONCAT(a, b) a##b - -#define ARM_DOT1(a, b, c) \ - ({ \ - c = fma(a, b, c); \ - }) -#define ARM_DOT2(a, b, c) \ - ({ \ - c = fma(a.s0, b.s0, c); \ - c = fma(a.s1, b.s1, c); \ - }) -#define ARM_DOT3(a, b, c) \ - ({ \ - ARM_DOT2(a, b, c); \ - c = fma((a.s2), (b.s2), c); \ - }) -#define ARM_DOT4(a, b, c) \ - ({ \ - ARM_DOT3(a, b, c); \ - c = fma((a.s3), (b.s3), c); \ - }) -#define ARM_DOT8(a, b, c) \ - ({ \ - ARM_DOT4((a.lo), (b.lo), c); \ - ARM_DOT4((a.hi), (b.hi), c); \ - }) -#define ARM_DOT16(a, b, c) \ - ({ \ - ARM_DOT8((a.lo), (b.lo), c); \ - ARM_DOT8((a.hi), (b.hi), c); \ - }) - -#if N0 == 2 -#define ARM_DOT_K0XN0(k0, a, b, c) \ - ({ \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##0), (c.s0)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##1), (c.s1)); \ - }) -#elif N0 == 3 // N0 == 3 -#define ARM_DOT_K0XN0(k0, a, b, c) \ - ({ \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##0), (c.s0)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##1), (c.s1)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##2), (c.s2)); \ - }) -#elif N0 == 4 // N0 == 4 -#define ARM_DOT_K0XN0(k0, a, b, c) \ - ({ \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##0), (c.s0)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##1), (c.s1)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##2), (c.s2)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##3), (c.s3)); \ - }) -#elif N0 == 8 // N0 == 8 -#define ARM_DOT_K0XN0(k0, a, b, c) \ - ({ \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##0), (c.s0)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##1), (c.s1)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##2), (c.s2)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##3), (c.s3)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##4), (c.s4)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##5), (c.s5)); \ - CONCAT(ARM_DOT, k0) \ - 
((a), (b##6), (c.s6)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##7), (c.s7)); \ - }) -#elif N0 == 16 // N0 == 16 -#define ARM_DOT_K0XN0(k0, a, b, c) \ - ({ \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##0), (c.s0)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##1), (c.s1)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##2), (c.s2)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##3), (c.s3)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##4), (c.s4)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##5), (c.s5)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##6), (c.s6)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##7), (c.s7)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##8), (c.s8)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##9), (c.s9)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##A), (c.sA)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##B), (c.sB)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##C), (c.sC)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##D), (c.sD)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##E), (c.sE)); \ - CONCAT(ARM_DOT, k0) \ - ((a), (b##F), (c.sF)); \ - }) -#else // N0 not supported -#error "N0 value not supported" -#endif // N0 conditions - -/** This OpenCL kernel computes the matrix multiplication between 2 matrices. - * The LHS matrix is NOT reshaped - * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed - * - * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. - * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90) - * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64) - * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4). - * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2) - * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2) - * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time. - * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) - * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) - * @note Only the following configurations of M0, N0 and K0 are currently supported: - * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 - * - N0 = 2, 3, 4, 8, 16 - * - K0 = 2, 3, 4, 8, 16 - * - H0 >= 1 - * - * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. - * The activation function is performed after the bias addition - * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time: - * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D - * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D - * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix - * - * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32 - * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes) - * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes) - * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix - * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr - * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes) - * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes) - * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix - * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr - * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes) - * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes) - * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D) - * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), - IMAGE_DECLARATION(rhs), -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint lhs_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if 
defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - // Block size -#define RHS_BLOCK_SIZE ((K0) * (N0)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (K0) -#define RHS_STEP_X ((K0) * (H0)) -#define RHS_STEP_LOOP (1) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X (K0) -#define RHS_STEP_LOOP (H0) -#endif // defined(RHS_INTERLEAVE) - - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; - - // Compute RHS reshaped matrix address - uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; -#else // defined(MATRIX_B_DEPTH) - rhs_offset += z * rhs_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0; - REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); - -#if defined(REINTERPRET_INPUT_AS_3D) - // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply lhs_stride_z by DEPTH_GEMM3D - lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - lhs_offset += z * lhs_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... 
c(M0-1)=0; - - int i = 0; - for(; i <= (K - K0); i += K0) - { - // Supported cases (M0, K0): - // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 - // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 - // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 - // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 - // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 - // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 - // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 - // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 - // Load values from LHS matrix - LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); - - // Load values from RHS reshaped matrix - LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero); - - // Accumulate - ARM_DOT_K0XN0(K0, a0, b, c0); -#if M0 > 1 - ARM_DOT_K0XN0(K0, a1, b, c1); -#endif // M0 > 1 -#if M0 > 2 - ARM_DOT_K0XN0(K0, a2, b, c2); -#endif // M0 > 2 -#if M0 > 3 - ARM_DOT_K0XN0(K0, a3, b, c3); -#endif // M0 > 3 -#if M0 > 4 - ARM_DOT_K0XN0(K0, a4, b, c4); -#endif // M0 > 4 -#if M0 > 5 - ARM_DOT_K0XN0(K0, a5, b, c5); -#endif // M0 > 5 -#if M0 > 6 - ARM_DOT_K0XN0(K0, a6, b, c6); -#endif // M0 > 6 -#if M0 > 7 - ARM_DOT_K0XN0(K0, a7, b, c7); -#endif // M0 > 7 - - lhs_offset += K0 * sizeof(DATA_TYPE); - rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE); - } - - // Left-over accumulations - for(; i < K; ++i) - { - // Load values from LHS matrix - LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); - - // Load values from RHS reshaped matrix - LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero); - - // Accumulate - ARM_DOT_K0XN0(1, a0, b, c0); -#if M0 > 1 - ARM_DOT_K0XN0(1, a1, b, c1); -#endif // M0 > 1 -#if M0 > 2 - ARM_DOT_K0XN0(1, a2, b, c2); -#endif // M0 > 2 -#if M0 > 3 - ARM_DOT_K0XN0(1, a3, b, c3); -#endif // M0 > 3 -#if M0 > 4 - ARM_DOT_K0XN0(1, a4, b, c4); -#endif // M0 > 4 -#if M0 > 5 - ARM_DOT_K0XN0(1, a5, b, c5); -#endif // M0 > 5 -#if M0 > 6 - ARM_DOT_K0XN0(1, a6, b, c6); -#endif // M0 > 6 -#if M0 > 7 - ARM_DOT_K0XN0(1, a7, b, c7); -#endif // M0 > 7 - - lhs_offset += sizeof(DATA_TYPE); - rhs_offset += sizeof(DATA_TYPE); - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - - // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BETA - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(M0, c, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z; - - LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BETA - - // c = c + bias - ADD_BLOCK(M0, c, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - -#if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL); -#endif // defined(ACTIVATION_TYPE) - - const bool cond_y = y == 0; - const bool cond_x = ((x + 1) * N0 >= N); - - // Store output block - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -} - -#if defined(OPENCL_IMAGE_SUPPORT) -/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in an OpenCL image - * The LHS matrix is NOT reshaped - * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed - * - * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel - * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. - * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90) - * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT= (e.g. -DRHS_HEIGHT=32) - * Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT - * could be different from the value returned by get_image_height(rhs_img). - * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4). - * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2) - * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2) - * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
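 - * As an illustration only, one self-consistent set of build options for this kernel (hypothetical sizes, F32 data, all dimensions divisible by the block sizes) could be:
 - *   -DOPENCL_IMAGE_SUPPORT -DDATA_TYPE=float -DM=64 -DN=64 -DK=64 -DM0=4 -DN0=4 -DK0=4 -DH0=4 -DRHS_HEIGHT=64 -DPARTIAL_STORE_M0=0 -DPARTIAL_STORE_N0=0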
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) - * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) - * @note Only the following configurations of M0, N0 and K0 are currently supported: - * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 - * - N0 = 4, 8, 16 - * - K0 = 4, 8, 16 - * - H0 >= 1 - * - * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. - * The activation function is performed after the bias addition - * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time: - * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D - * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D - * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. - * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix - * - * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F32 - * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes) - * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes) - * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix - * @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr - * @param[in] bias_ptr (Optional) Pointer to the bias matrix. 
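 - * A note on the PIXEL_UNIT arithmetic used in the kernel body below (a sketch, assuming the RGBA image formats the library creates for its F32/F16 buffers): one image pixel carries four elements, so a K0-wide block row spans K0/4 pixels:
 - *   K0 = 4  -> PIXEL_UNIT = 1
 - *   K0 = 8  -> PIXEL_UNIT = 2
 - *   K0 = 16 -> PIXEL_UNIT = 4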
Supported data type: same as @p lhs_ptr - * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes) - * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes) - * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D) - * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs), - __read_only image2d_t rhs_img, -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint lhs_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - // Pixel unit -#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0) - -#define LEFTOVER_K (K % K0) - - // Block size -#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (PIXEL_UNIT) -#define RHS_STEP_X (PIXEL_UNIT * (H0)) -#define RHS_STEP_LOOP (1) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X PIXEL_UNIT -#define RHS_STEP_LOOP (H0) -#endif // defined(RHS_INTERLEAVE) - - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH); -#else // defined(MATRIX_B_DEPTH) - const uint z_rhs = get_global_id(2); -#endif // defined(MATRIX_B_DEPTH) - - // Compute RHS matrix coordinates - uint x_rhs = (get_global_id(0) % 
H0) * (uint)RHS_OFFSET_X; - const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT; - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); - REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); - -#if defined(REINTERPRET_INPUT_AS_3D) - // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply lhs_stride_z by DEPTH_GEMM3D - lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - lhs_offset += z * lhs_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); - - int i = 0; - for(; i <= (K - K0); i += K0) - { - // Load values from LHS matrix - LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); - - // Load values from RHS matrix stored in a cl_image - REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0); - LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0); - - // Accumulate - ARM_DOT_K0XN0(K0, a0, b, c0); -#if M0 > 1 - ARM_DOT_K0XN0(K0, a1, b, c1); -#endif // M0 > 1 -#if M0 > 2 - ARM_DOT_K0XN0(K0, a2, b, c2); -#endif // M0 > 2 -#if M0 > 3 - ARM_DOT_K0XN0(K0, a3, b, c3); -#endif // M0 > 3 -#if M0 > 4 - ARM_DOT_K0XN0(K0, a4, b, c4); -#endif // M0 > 4 -#if M0 > 5 - ARM_DOT_K0XN0(K0, a5, b, c5); -#endif // M0 > 5 -#if M0 > 6 - ARM_DOT_K0XN0(K0, a6, b, c6); -#endif // M0 > 6 -#if M0 > 7 - ARM_DOT_K0XN0(K0, a7, b, c7); -#endif // M0 > 7 - - lhs_offset += K0 * sizeof(DATA_TYPE); - x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP; - } - -#if LEFTOVER_K != 0 - // Note: We cannot read out-of-bound elements from the RHS matrix because - // the RHS width is always a multiple of K0. This is not true for the LHS matrix - - union UNION_VEC_TYPE - { - DATA_TYPE s[K0]; - VEC_DATA_TYPE(DATA_TYPE, K0) - v; - }; - - union UNION_VEC_TYPE a0 = {.v = 0 }; -#if M0 > 1 - union UNION_VEC_TYPE a1 = {.v = 0 }; -#endif // M0 > 1 -#if M0 > 2 - union UNION_VEC_TYPE a2 = {.v = 0 }; -#endif // M0 > 2 -#if M0 > 3 - union UNION_VEC_TYPE a3 = {.v = 0 }; -#endif // M0 > 3 -#if M0 > 4 - union UNION_VEC_TYPE a4 = {.v = 0 }; -#endif // M0 > 4 -#if M0 > 5 - union UNION_VEC_TYPE a5 = {.v = 0 }; -#endif // M0 > 5 -#if M0 > 6 - union UNION_VEC_TYPE a6 = {.v = 0 }; -#endif // M0 > 6 -#if M0 > 7 - union UNION_VEC_TYPE a7 = {.v = 0 }; -#endif // M0 > 7 - - REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0); - - // Load from RHS matrix - LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0); - - // Load from LHS matrix - for(int k = 0; k < LEFTOVER_K; ++k) - { - a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0); -#if M0 > 1 - a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1); -#endif // M0 > 1 -#if M0 > 2 - a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2); -#endif // M0 > 2 -#if M0 > 3 - a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3); -#endif // M0 > 3 -#if M0 > 4 - a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4); -#endif // M0 > 4 -#if M0 > 5 - a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5); -#endif // M0 > 5 -#if M0 > 6 - a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6); -#endif // M0 > 6 -#if M0 > 7 - a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7); -#endif // M0 > 7 - - lhs_offset += sizeof(DATA_TYPE); - } - - // Accumulate - ARM_DOT_K0XN0(K0, a0.v, b, c0); -#if M0 > 1 - ARM_DOT_K0XN0(K0, a1.v, b, c1); -#endif // M0 > 1 -#if M0 > 2 - ARM_DOT_K0XN0(K0, a2.v, b, c2); -#endif // M0 > 2 -#if M0 > 3 - ARM_DOT_K0XN0(K0, a3.v, b, c3); -#endif // M0 > 3 -#if M0 > 4 - ARM_DOT_K0XN0(K0, a4.v, b, c4); -#endif // M0 > 4 -#if M0 > 5 - ARM_DOT_K0XN0(K0, a5.v, b, c5); -#endif // M0 > 5 -#if M0 > 6 - ARM_DOT_K0XN0(K0, a6.v, b, c6); -#endif // M0 > 6 -#if M0 > 7 - ARM_DOT_K0XN0(K0, a7.v, b, c7); -#endif // M0 > 7 - -#endif // LEFTOVER_K != 0 - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout(M0-1)=0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - - // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - - // Add offset for batched GEMM.
The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BETA - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(M0, c, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z; - - LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BETA - - // c = c + bias - ADD_BLOCK(M0, c, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - -#if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL); -#endif // defined(ACTIVATION_TYPE) - - const bool cond_y = y == 0; - const bool cond_x = ((x + 1) * N0 >= N); - - // Store output block - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -#undef LEFTOVER_K -#undef PIXEL_UNIT -} -#endif // defined(OPENCL_IMAGE_SUPPORT) - -#define VFMA(a, b, c) \ - ({ \ - c = fma(a, b, c); \ - }) - -#if M0 == 1 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - }) -#elif M0 == 2 // M0 == 2 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - }) -#elif M0 == 3 // M0 == 3 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - }) -#elif M0 == 4 // M0 == 4 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - }) -#elif M0 == 5 // M0 == 5 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - }) -#elif M0 == 6 // M0 == 6 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ -
VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ - }) -#elif M0 == 7 // M0 == 7 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ - }) -#elif M0 == 8 // M0 == 8 -#define VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \ - }) -#else // M0 not supported -#error "M0 not supported" -#endif // M0 not supported - -/** This OpenCL kernel computes the matrix multiplication between 2 matrices. - * The LHS matrix is NOT reshaped - * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed - * - * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. - * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90). - * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4). - * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2) - * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2) - * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time. - * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) - * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) - * @note Only the following configurations of M0, N0 and K0 are currently supported: - * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 - * - N0 = 2, 3, 4, 8, 16 - * - K0 = 2, 3, 4, 8, 16 - * - H0 >= 1 - * - * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
- * The activation function is performed after the bias addition - * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time: - * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D - * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D - * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. - * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix - * - * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32 - * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes) - * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes) - * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix - * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr - * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes) - * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes) - * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix - * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr - * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes) - * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes) - * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D) - * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ 
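A scalar sketch of what a single VFMA_M0xN0(i, a, b0, c) step inside the kernel below computes, assuming a stands for the M0 loaded LHS rows of K0 elements each and b0 for one row of N0 reshaped RHS values (illustrative only):

    for(int m = 0; m < M0; ++m)
    {
        for(int n = 0; n < N0; ++n)
        {
            c[m][n] = fma(a[m][i], b0[n], c[m][n]); // element i of LHS row m is broadcast across the N0 accumulators
        }
    }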
-__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs), - IMAGE_DECLARATION(rhs), -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint lhs_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - // Block size -#define RHS_BLOCK_SIZE ((K0) * (N0)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (N0) -#define RHS_STEP_X ((N0) * (H0)) -#define RHS_STEP_LOOP (1) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X (N0) -#define RHS_STEP_LOOP (H0) -#endif // defined(RHS_INTERLEAVE) - - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; - - // Compute RHS reshaped matrix address - uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; -#else // defined(MATRIX_B_DEPTH) - rhs_offset += z * rhs_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0; - REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,zero2=0,... zero7=0; - -#if defined(REINTERPRET_INPUT_AS_3D) - - // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply lhs_stride_z by DEPTH_GEMM3D - lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - lhs_offset += z * lhs_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... 
c(N0-1)=0; - - int i = 0; - for(; i <= (K - K0); i += K0) - { - // Supported cases (M0, K0): - // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 - // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 - // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 - // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 - // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 - // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 - // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 - // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 - // Load values from LHS matrix - LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin); - - VEC_DATA_TYPE(DATA_TYPE, N0) - b0; - - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(0, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(1, a, b0, c); -#if K0 > 2 - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(2, a, b0, c); -#endif // K0 > 2 -#if K0 > 3 - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(3, a, b0, c); -#endif // K0 > 3 -#if K0 > 4 - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(4, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(5, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(6, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(7, a, b0, c); -#endif // K0 > 4 -#if K0 > 8 - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(8, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(9, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(A, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(B, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(C, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(D, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(E, a, b0, c); - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(F, a, b0, c); -#endif // K0 > 8 - - lhs_offset += K0 * sizeof(DATA_TYPE); - rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE); - } - - // Left-over accumulations - for(; i < K; ++i) - { - // Load values from LHS matrix - VEC_DATA_TYPE(DATA_TYPE, 2) - a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0)); -#if M0 > 1 - VEC_DATA_TYPE(DATA_TYPE, 2) - a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1)); -#endif // M0 > 1 -#if M0 > 2 - VEC_DATA_TYPE(DATA_TYPE, 2) - a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2)); -#endif // M0 > 2 -#if M0 > 3 - VEC_DATA_TYPE(DATA_TYPE, 2) - a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3)); -#endif // M0 > 3 -#if M0 > 4 - VEC_DATA_TYPE(DATA_TYPE, 2) - a4 = *((__global DATA_TYPE 
*)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4)); -#endif // M0 > 4 -#if M0 > 5 - VEC_DATA_TYPE(DATA_TYPE, 2) - a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5)); -#endif // M0 > 5 -#if M0 > 6 - VEC_DATA_TYPE(DATA_TYPE, 2) - a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6)); -#endif // M0 > 6 -#if M0 > 7 - VEC_DATA_TYPE(DATA_TYPE, 2) - a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7)); -#endif // M0 > 7 - - VEC_DATA_TYPE(DATA_TYPE, N0) - b0; - - b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE))); - VFMA_M0xN0(0, a, b0, c); - - lhs_offset += sizeof(DATA_TYPE); - rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE); - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(M0, c, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z; - - LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias - ADD_BLOCK(M0, c, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - -#if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL); -#endif // defined(ACTIVATION_TYPE) - - const bool cond_y = y == 0; - const bool cond_x = ((x + 1) * N0 >= N); - - // Store output block - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -} - -#if defined(OPENCL_IMAGE_SUPPORT) -/** This OpenCL kernel computes the matrix multiplication between 2 matrices. 
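Before the texture variant, it is worth spelling out the accumulation pattern that gemm_mm_reshaped_only_rhs_nt above unrolls by hand: per K0 step, each N0-wide RHS row is combined with one LHS scalar per output row via a fused multiply-add (the b0 = VLOAD(...) / VFMA_M0xN0 pairs). A self-contained plain-C sketch of that register-tiled loop, with plain arrays standing in for the reshaped buffers:

#include <math.h>
#include <stdio.h>

#define M0 2
#define N0 4
#define K0 4

/* One work-item's worth of work: an M0 x N0 output tile, accumulated
 * K0 columns at a time, mirroring LOAD_BLOCK + VFMA_M0xN0. */
static void micro_kernel(const float a[M0][K0], const float b[K0][N0],
                         float c[M0][N0])
{
    for (int k = 0; k < K0; ++k)      /* the unrolled b0 = VLOAD(...) steps */
        for (int m = 0; m < M0; ++m)  /* VFMA_M0xN0: one fma per output row */
            for (int n = 0; n < N0; ++n)
                c[m][n] = fmaf(a[m][k], b[k][n], c[m][n]);
}

int main(void)
{
    float a[M0][K0] = {{1, 2, 3, 4}, {5, 6, 7, 8}};
    float b[K0][N0] = {{1, 0, 0, 0}, {0, 1, 0, 0}, {0, 0, 1, 0}, {0, 0, 0, 1}};
    float c[M0][N0] = {0};
    micro_kernel(a, b, c);
    for (int m = 0; m < M0; ++m, puts(""))
        for (int n = 0; n < N0; ++n)
            printf("%5.1f ", c[m][n]);
    return 0;
}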
- * The LHS matrix is NOT reshaped
- * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
- *
- * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
- * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
- * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).
- * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT= (e.g. -DRHS_HEIGHT=32).
- *       Since we cannot create a 3D image from a buffer, the third dimension could be collapsed with the second dimension, so RHS_HEIGHT
- *       could be different from the value returned by get_image_height(rhs_img).
- * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
- * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
- * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
- * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note Only the following configurations of M0, N0 and K0 are currently supported:
- *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
- *  - N0 = 4, 8, 16
- *  - K0 = 4, 8, 16
- *  - H0 >= 1
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
- *       The activation function is performed after the bias addition
- * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
- *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
- *
- * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F32
- * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
- * @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem (in bytes)
- * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
- * @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem (in bytes)
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
- * @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
- * @param[in] bias_ptr (Optional) Pointer to the bias matrix.
Supported data type: same as @p lhs_ptr - * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes) - * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes) - * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D) - * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs), - __read_only image2d_t rhs_img, -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint lhs_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - // Pixel unit -#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0) - - // Block size -#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (PIXEL_UNIT) -#define RHS_STEP_X ((PIXEL_UNIT) * (H0)) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X (PIXEL_UNIT) -#endif // defined(RHS_INTERLEAVE) - - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - const uint z_rhs = (z % MATRIX_B_DEPTH); -#else // defined(MATRIX_B_DEPTH) - const uint z_rhs = z; -#endif // defined(MATRIX_B_DEPTH) - - // Compute RHS matrix coordinates - uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X; - const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT; - - REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 
0); - REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); - -#if defined(REINTERPRET_INPUT_AS_3D) - - // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply lhs_stride_z by DEPTH_GEMM3D - lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - lhs_offset += z * lhs_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); - - int i = 0; - for(; i <= (K - K0); i += K0) - { - // Load values from LHS matrix - LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin); - - VEC_DATA_TYPE(DATA_TYPE, N0) - b0; - - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(0, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(1, a, b0, c); -#if K0 > 2 - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(2, a, b0, c); -#endif // K0 > 2 -#if K0 > 3 - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(3, a, b0, c); -#endif // K0 > 3 -#if K0 > 4 - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(4, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(5, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(6, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(7, a, b0, c); -#endif // K0 > 4 -#if K0 > 8 - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(8, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(9, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(A, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(B, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(C, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(D, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(E, a, b0, c); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs)); - VFMA_M0xN0(F, a, b0, c); -#endif // K0 > 8 - - lhs_offset += K0 * sizeof(DATA_TYPE); - x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP; - } - - // Left-over accumulations - for(; i < K; ++i) - { - // Load values from LHS matrix - VEC_DATA_TYPE(DATA_TYPE, 2) - a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0)); -#if M0 > 1 - VEC_DATA_TYPE(DATA_TYPE, 2) - a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1)); -#endif // M0 > 1 -#if M0 > 2 - VEC_DATA_TYPE(DATA_TYPE, 2) - a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2)); -#endif // M0 > 2 -#if M0 > 3 - VEC_DATA_TYPE(DATA_TYPE, 2) 
- a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3)); -#endif // M0 > 3 -#if M0 > 4 - VEC_DATA_TYPE(DATA_TYPE, 2) - a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4)); -#endif // M0 > 4 -#if M0 > 5 - VEC_DATA_TYPE(DATA_TYPE, 2) - a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5)); -#endif // M0 > 5 -#if M0 > 6 - VEC_DATA_TYPE(DATA_TYPE, 2) - a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6)); -#endif // M0 > 6 -#if M0 > 7 - VEC_DATA_TYPE(DATA_TYPE, 2) - a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7)); -#endif // M0 > 7 - - VEC_DATA_TYPE(DATA_TYPE, N0) - b0; - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs)); - - VFMA_M0xN0(0, a, b0, c); - - lhs_offset += sizeof(DATA_TYPE); - x_rhs += RHS_STEP_X; - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(M0, c, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z; - - LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias - ADD_BLOCK(M0, c, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - -#if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL); -#endif // defined(ACTIVATION_TYPE) - - const bool cond_y = y == 0; - const bool cond_x = ((x + 1) * N0 >= N); - - // Store output block - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -} -#endif // defined(OPENCL_IMAGE_SUPPORT) -#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K) - -#if defined(M0) && defined(N0) && defined(K0) && defined(V0) 
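An aside on the texture path just removed: READ_IMAGE2D fetches whole pixels rather than elements, so N0 elements become CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0) pixels (e.g. four floats per RGBA32F pixel), and the RHS coordinates are x_rhs = (x % H0) * RHS_OFFSET_X and y_rhs = x / H0 + z_rhs * RHS_HEIGHT, with batches stacked along the image Y axis. A rough host-side C model of that coordinate math, assuming four elements per pixel (illustrative names, not the library's code):

#include <stdio.h>

/* Hypothetical model: map the (block x, batch z) indices used by the
 * texture kernel to a 2D image coordinate, assuming 4 elements/pixel. */
#define ELEMS_PER_PIXEL 4

static void rhs_image_coords(unsigned x, unsigned z, unsigned H0,
                             unsigned rhs_height, unsigned n0,
                             unsigned *x_px, unsigned *y_px)
{
    unsigned pixel_unit = n0 / ELEMS_PER_PIXEL; /* CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT */
    *x_px = (x % H0) * pixel_unit;              /* RHS_OFFSET_X, interleaved case   */
    *y_px = (x / H0) + z * rhs_height;          /* batches stacked along image Y    */
}

int main(void)
{
    unsigned xp, yp;
    rhs_image_coords(/*x=*/5, /*z=*/1, /*H0=*/2, /*RHS_HEIGHT=*/32, /*N0=*/8,
                     &xp, &yp);
    printf("pixel coords: (%u, %u)\n", xp, yp);
    return 0;
}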
&& defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N) - -#if defined(MIXED_PRECISION) -#if K0 == 2 -#define ARM_DOT_K0(a, b, c) \ - ({ \ - c += a.s0 * b.s0; \ - c += a.s1 * b.s1; \ - }) -#elif K0 == 3 // K0 == 3 -#define ARM_DOT_K0(a, b, c) \ - ({ \ - c += a.s0 * b.s0; \ - c += a.s1 * b.s1; \ - c += a.s2 * b.s2; \ - }) -#elif K0 == 4 // K0 == 4 -#define ARM_DOT_K0(a, b, c) \ - ({ \ - c += a.s0 * b.s0; \ - c += a.s1 * b.s1; \ - c += a.s2 * b.s2; \ - c += a.s3 * b.s3; \ - }) -#elif K0 == 8 // K0 == 8 -#define ARM_DOT_K0(a, b, c) \ - ({ \ - c += a.s0 * b.s0; \ - c += a.s1 * b.s1; \ - c += a.s2 * b.s2; \ - c += a.s3 * b.s3; \ - c += a.s4 * b.s4; \ - c += a.s5 * b.s5; \ - c += a.s6 * b.s6; \ - c += a.s7 * b.s7; \ - }) -#elif K0 == 16 // K0 == 16 -#define ARM_DOT_K0(a, b, c) \ - ({ \ - c += a.s0 * b.s0; \ - c += a.s1 * b.s1; \ - c += a.s2 * b.s2; \ - c += a.s3 * b.s3; \ - c += a.s4 * b.s4; \ - c += a.s5 * b.s5; \ - c += a.s6 * b.s6; \ - c += a.s7 * b.s7; \ - c += a.s8 * b.s8; \ - c += a.s9 * b.s9; \ - c += a.sA * b.sA; \ - c += a.sB * b.sB; \ - c += a.sC * b.sC; \ - c += a.sD * b.sD; \ - c += a.sE * b.sE; \ - c += a.sF * b.sF; \ - }) -#else // K0 not supported -#error "K0 value not supported" -#endif // K0 conditions -#else // defined(MIXED_PRECISION) -#if K0 == 2 -#define ARM_DOT_K0(a, b, c) \ - ({ \ - c = fma(a.s0, b.s0, c); \ - c = fma(a.s1, b.s1, c); \ - }) -#elif K0 == 3 // K0 == 3 -#define ARM_DOT_K0(a, b, c) \ - ({ \ - c = fma(a.s0, b.s0, c); \ - c = fma(a.s1, b.s1, c); \ - c = fma(a.s2, b.s2, c); \ - }) -#elif K0 == 4 // K0 == 4 -#define ARM_DOT_K0(a, b, c) \ - ({ \ - c = fma(a.s0, b.s0, c); \ - c = fma(a.s1, b.s1, c); \ - c = fma(a.s2, b.s2, c); \ - c = fma(a.s3, b.s3, c); \ - }) -#elif K0 == 8 // K0 == 8 -#define ARM_DOT_K0(a, b, c) \ - ({ \ - c = fma(a.s0, b.s0, c); \ - c = fma(a.s1, b.s1, c); \ - c = fma(a.s2, b.s2, c); \ - c = fma(a.s3, b.s3, c); \ - c = fma(a.s4, b.s4, c); \ - c = fma(a.s5, b.s5, c); \ - c = fma(a.s6, b.s6, c); \ - c = fma(a.s7, b.s7, c); \ - }) -#elif K0 == 16 // K0 == 16 -#define ARM_DOT_K0(a, b, c) \ - ({ \ - c = fma(a.s0, b.s0, c); \ - c = fma(a.s1, b.s1, c); \ - c = fma(a.s2, b.s2, c); \ - c = fma(a.s3, b.s3, c); \ - c = fma(a.s4, b.s4, c); \ - c = fma(a.s5, b.s5, c); \ - c = fma(a.s6, b.s6, c); \ - c = fma(a.s7, b.s7, c); \ - c = fma(a.s8, b.s8, c); \ - c = fma(a.s9, b.s9, c); \ - c = fma(a.sA, b.sA, c); \ - c = fma(a.sB, b.sB, c); \ - c = fma(a.sC, b.sC, c); \ - c = fma(a.sD, b.sD, c); \ - c = fma(a.sE, b.sE, c); \ - c = fma(a.sF, b.sF, c); \ - }) -#else // K0 not supported -#error "K0 value not supported" -#endif // K0 conditions -#endif // defined(MIXED_PRECISION) - -#if defined(ARM_DOT_K0XN0) -#undef ARM_DOT_K0XN0 -#endif // defined(ARM_DOT_K0XN0) - -#if N0 == 2 -#define ARM_DOT_K0XN0(a, b, c) \ - ({ \ - ARM_DOT_K0((a), (b##0), (c.s0)); \ - ARM_DOT_K0((a), (b##1), (c.s1)); \ - }) -#elif N0 == 3 // N0 == 3 -#define ARM_DOT_K0XN0(a, b, c) \ - ({ \ - ARM_DOT_K0((a), (b##0), (c.s0)); \ - ARM_DOT_K0((a), (b##1), (c.s1)); \ - ARM_DOT_K0((a), (b##2), (c.s2)); \ - }) -#elif N0 == 4 // N0 == 4 -#define ARM_DOT_K0XN0(a, b, c) \ - ({ \ - ARM_DOT_K0((a), (b##0), (c.s0)); \ - ARM_DOT_K0((a), (b##1), (c.s1)); \ - ARM_DOT_K0((a), (b##2), (c.s2)); \ - ARM_DOT_K0((a), (b##3), (c.s3)); \ - }) -#elif N0 == 8 // N0 == 8 -#define ARM_DOT_K0XN0(a, b, c) \ - ({ \ - ARM_DOT_K0((a), (b##0), (c.s0)); \ - ARM_DOT_K0((a), (b##1), (c.s1)); \ - ARM_DOT_K0((a), (b##2), (c.s2)); \ - ARM_DOT_K0((a), (b##3), (c.s3)); \ - ARM_DOT_K0((a), (b##4), 
(c.s4)); \ - ARM_DOT_K0((a), (b##5), (c.s5)); \ - ARM_DOT_K0((a), (b##6), (c.s6)); \ - ARM_DOT_K0((a), (b##7), (c.s7)); \ - }) -#elif N0 == 16 // N0 == 16 -#define ARM_DOT_K0XN0(a, b, c) \ - ({ \ - ARM_DOT_K0((a), (b##0), (c.s0)); \ - ARM_DOT_K0((a), (b##1), (c.s1)); \ - ARM_DOT_K0((a), (b##2), (c.s2)); \ - ARM_DOT_K0((a), (b##3), (c.s3)); \ - ARM_DOT_K0((a), (b##4), (c.s4)); \ - ARM_DOT_K0((a), (b##5), (c.s5)); \ - ARM_DOT_K0((a), (b##6), (c.s6)); \ - ARM_DOT_K0((a), (b##7), (c.s7)); \ - ARM_DOT_K0((a), (b##8), (c.s8)); \ - ARM_DOT_K0((a), (b##9), (c.s9)); \ - ARM_DOT_K0((a), (b##A), (c.sA)); \ - ARM_DOT_K0((a), (b##B), (c.sB)); \ - ARM_DOT_K0((a), (b##C), (c.sC)); \ - ARM_DOT_K0((a), (b##D), (c.sD)); \ - ARM_DOT_K0((a), (b##E), (c.sE)); \ - ARM_DOT_K0((a), (b##F), (c.sF)); \ - }) -#else // N0 not supported -#error "N0 value not supported" -#endif // N0 conditions - -/** This OpenCL kernel computes the matrix multiplication between 2 matrices. - * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed - * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed - * - * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) - * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float) - * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float - * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. - * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24). - * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4). - * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2) - * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2) - * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time. - * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time. - * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) - * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) - * @note Only the following configurations of M0, N0 and K0 are currently supported: - * - M0 = 2, 3, 4, 5, 6, 7, 8 - * - N0 = 2, 3, 4, 8, 16 - * - K0 = 2, 3, 4, 8, 16 - * - V0 >= 1 - * - H0 >= 1 - * - * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. - * The activation function is performed after the bias addition - * @note In case the output has to be reinterpreted as a 3D tensor (e.g. 
output of convolution layer), the following information must be passed at compile time: - * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D - * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. - * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped - * - * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32 - * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes) - * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes) - * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix - * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr - * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes) - * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes) - * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix - * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr - * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped. 
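Because this kernel consumes a transposed K0xN0 RHS block, the accumulation flips from the broadcast-FMA form used earlier to K0-long dot products: ARM_DOT_K0XN0 computes, for each LHS row a and each of the N0 RHS rows b, c.sN += dot(a, bN). A plain-C sketch of that pattern under the non-MIXED_PRECISION (fma) variant, with illustrative names:

#include <math.h>
#include <stdio.h>

#define M0 2
#define N0 2
#define K0 4

/* ARM_DOT_K0 analogue: a K0-long dot product folded into one accumulator
 * with fmaf, as the non-MIXED_PRECISION macro expands to. */
static float dot_k0(const float a[K0], const float b[K0])
{
    float c = 0.0f;
    for (int k = 0; k < K0; ++k)
        c = fmaf(a[k], b[k], c);
    return c;
}

int main(void)
{
    /* Both operands laid out K0-contiguous, as the reshape kernels produce. */
    float a[M0][K0] = {{1, 2, 3, 4}, {4, 3, 2, 1}};
    float b[N0][K0] = {{1, 1, 1, 1}, {2, 0, 2, 0}};
    for (int m = 0; m < M0; ++m)      /* ARM_DOT_K0XN0 applied to M0 rows */
        for (int n = 0; n < N0; ++n)
            printf("c[%d].s%d = %g\n", m, n, dot_k0(a[m], b[n]));
    return 0;
}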
- * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes) - * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes) - * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), - IMAGE_DECLARATION(rhs), -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint k, - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - // Block size -#define LHS_BLOCK_SIZE ((K0) * (M0)) - -#if defined(LHS_INTERLEAVE) -#define LHS_OFFSET_X (K0) -#define LHS_STEP_X ((K0) * (V0)) -#define LHS_STEP_LOOP (1) -#else // defined(INTERLEAVE) -#define LHS_OFFSET_X (LHS_BLOCK_SIZE) -#define LHS_STEP_X (K0) -#define LHS_STEP_LOOP (V0) -#endif // defined(INTERLEAVE) - - // Block size -#define RHS_BLOCK_SIZE ((K0) * (N0)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (K0) -#define RHS_STEP_X ((K0) * (H0)) -#define RHS_STEP_LOOP (1) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X (K0) -#define RHS_STEP_LOOP (H0) -#endif // defined(RHS_INTERLEAVE) - -#if defined(DUMMY_WORK_ITEMS) - if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y + - (get_global_id(2) * lhs_stride_z); - - // Compute RHS matrix address - __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z; -#else // defined(MATRIX_B_DEPTH) - rhs_addr += get_global_id(2) * rhs_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... 
zlhs7=0; - REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); - - for(int i = 0; i < k; i += K0) - { - // Supported cases (M0, K0): - // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 - // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 - // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 - // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 - // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 - // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 - // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 - // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 - // Load values from LHS matrix - LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs); - - // Load values from RHS matrix - LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero); - - // Accumulate - ARM_DOT_K0XN0(a0, b, c0); -#if M0 > 1 - ARM_DOT_K0XN0(a1, b, c1); -#endif // M0 > 1 -#if M0 > 2 - ARM_DOT_K0XN0(a2, b, c2); -#endif // M0 > 2 -#if M0 > 3 - ARM_DOT_K0XN0(a3, b, c3); -#endif // M0 > 3 -#if M0 > 4 - ARM_DOT_K0XN0(a4, b, c4); -#endif // M0 > 4 -#if M0 > 5 - ARM_DOT_K0XN0(a5, b, c5); -#endif // M0 > 5 -#if M0 > 6 - ARM_DOT_K0XN0(a6, b, c6); -#endif // M0 > 6 -#if M0 > 7 - ARM_DOT_K0XN0(a7, b, c7); -#endif // M0 > 7 - - lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE); - rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE); - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); - -#if defined(REINTERPRET_OUTPUT_AS_3D) - - // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += get_global_id(2) * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] -#if defined(MIXED_PRECISION) - CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); - ADD_BLOCK_BROADCAST(M0, c, bias_hp0); -#else // defined(MIXED_PRECISION) - ADD_BLOCK_BROADCAST(M0, c, bias0); -#endif // defined(MIXED_PRECISION) - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id( - 2) * bias_stride_z; - - LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias -#if defined(MIXED_PRECISION) - CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); - ADD_BLOCK(M0, c, bias_hp); -#else // defined(MIXED_PRECISION) - ADD_BLOCK(M0, c, bias); -#endif // defined(MIXED_PRECISION) - -#endif // 
defined(BROADCAST_BIAS) -#endif // defined(BETA) - -#if defined(ACTIVATION_TYPE) -#if defined(MIXED_PRECISION) - ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL); -#else // defined(MIXED_PRECISION) - ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL); -#endif // defined(MIXED_PRECISION) -#endif // defined(ACTIVATION_TYPE) - - const bool cond_y = ((get_global_id(1) + 1) * M0 >= M); - const bool cond_x = ((get_global_id(0) + 1) * N0 >= N); - - // Store output block -#if defined(MIXED_PRECISION) - CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp); - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); -#else // defined(MIXED_PRECISION) - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); -#endif // defined(MIXED_PRECISION) - -#undef LHS_BLOCK_SIZE -#undef LHS_OFFSET_X -#undef LHS_STEP_X -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -#undef LHS_STEP_LOOP -#undef RHS_STEP_LOOP -} - -#if defined(OPENCL_IMAGE_SUPPORT) -/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object. - * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed - * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed - * - * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel - * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) - * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float) - * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float - * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. - * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24). - * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT= (e.g. -DRHS_HEIGHT=32) - * Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT - * could be different from the value returned by get_image_height(rhs_img). - * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4). - * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2) - * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2) - * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time. - * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time. 
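The MIXED_PRECISION path above keeps the accumulators in DATA_TYPE_ACCUMULATOR and only converts for the bias add and the final store (the CONVERT_BLOCK calls producing bias_hp and c_lp). Plain C has no half type, so the sketch below illustrates the same idea one level up, float data with a double accumulator; the benefit is the same in kind, not in magnitude:

#include <stdio.h>

/* Why accumulate in a wider type: summing many terms into a same-width
 * accumulator loses the tail once the running sum grows large. */
int main(void)
{
    float  acc_narrow = 0.0f;
    double acc_wide   = 0.0;  /* stand-in for DATA_TYPE_ACCUMULATOR */
    for (int i = 0; i < 20000000; ++i)
    {
        acc_narrow += 1.0f;   /* stalls at 2^24 = 16777216 */
        acc_wide   += 1.0;
    }
    /* The "convert on store" step: narrow only at the very end. */
    printf("narrow: %.1f  wide-then-convert: %.1f\n",
           acc_narrow, (float)acc_wide);
    return 0;
}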
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note Only the following configurations of M0, N0 and K0 are currently supported:
- *  - M0 = 2, 3, 4, 5, 6, 7, 8
- *  - N0 = 4, 8, 16
- *  - K0 = 4, 8, 16
- *  - V0 >= 1
- *  - H0 >= 1
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
- *       The activation function is performed after the bias addition
- * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
- *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
- *
- * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F32
- * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
- * @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem (in bytes)
- * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
- * @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem (in bytes)
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
- * @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
- * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
- * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)
- * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem (in bytes)
- * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)
- * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem (in bytes)
- * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
- * @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.
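The PARTIAL_STORE_M0/PARTIAL_STORE_N0 notes above are what make these kernels work when M and N are not multiples of M0 and N0: STORE_BLOCK_BOUNDARY_AWARE shrinks the stored tile when cond_y/cond_x flag an edge tile. A compact C model of that edge handling (hypothetical names):

#include <stdio.h>

/* Store an m0 x n0 accumulator tile at (row, col) of an M x N matrix,
 * clamping the tile at the bottom/right edges like the partial-store path. */
static void store_tile_boundary_aware(float *dst, int M, int N,
                                      int row, int col, int m0, int n0,
                                      const float *tile)
{
    int rows = (row + m0 > M) ? M - row : m0; /* PARTIAL_STORE_M0 case */
    int cols = (col + n0 > N) ? N - col : n0; /* PARTIAL_STORE_N0 case */
    for (int i = 0; i < rows; ++i)
        for (int j = 0; j < cols; ++j)
            dst[(row + i) * N + (col + j)] = tile[i * n0 + j];
}

int main(void)
{
    enum { M = 5, N = 5, M0 = 4, N0 = 4 };
    float dst[M * N] = {0};
    float tile[M0 * N0];
    for (int i = 0; i < M0 * N0; ++i) tile[i] = 1.0f;
    store_tile_boundary_aware(dst, M, N, 4, 4, M0, N0, tile); /* edge tile */
    printf("dst[4][4] = %.0f\n", dst[4 * N + 4]);
    return 0;
}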
- * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes) - * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes) - * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs), - __read_only image2d_t rhs_img, -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint k, - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - // Pixel unit -#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0) - - // Block size -#define LHS_BLOCK_SIZE ((K0) * (M0)) - -#if defined(LHS_INTERLEAVE) -#define LHS_OFFSET_X (K0) -#define LHS_STEP_X ((K0) * (V0)) -#define LHS_STEP_LOOP (1) -#else // defined(INTERLEAVE) -#define LHS_OFFSET_X (LHS_BLOCK_SIZE) -#define LHS_STEP_X (K0) -#define LHS_STEP_LOOP (V0) -#endif // defined(INTERLEAVE) - - // Block size -#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (PIXEL_UNIT) -#define RHS_STEP_X (PIXEL_UNIT * (H0)) -#define RHS_STEP_LOOP (1) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X PIXEL_UNIT -#define RHS_STEP_LOOP (H0) -#endif // defined(RHS_INTERLEAVE) - -#if defined(DUMMY_WORK_ITEMS) - if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y + - (get_global_id(2) * lhs_stride_z); - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH); -#else // defined(MATRIX_B_DEPTH) - const uint z_rhs = get_global_id(2); -#endif // defined(MATRIX_B_DEPTH) - - // Compute RHS matrix coordinates - uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X; - const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT; - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... 
zlhs7=0; - REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); - - for(int i = 0; i < K; i += K0) - { - // Load values from LHS matrix - LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs); - - // Load values from RHS matrix stored in a cl_image - REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0); - LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0); - - // Accumulate - ARM_DOT_K0XN0(a0, b, c0); -#if M0 > 1 - ARM_DOT_K0XN0(a1, b, c1); -#endif // M0 > 1 -#if M0 > 2 - ARM_DOT_K0XN0(a2, b, c2); -#endif // M0 > 2 -#if M0 > 3 - ARM_DOT_K0XN0(a3, b, c3); -#endif // M0 > 3 -#if M0 > 4 - ARM_DOT_K0XN0(a4, b, c4); -#endif // M0 > 4 -#if M0 > 5 - ARM_DOT_K0XN0(a5, b, c5); -#endif // M0 > 5 -#if M0 > 6 - ARM_DOT_K0XN0(a6, b, c6); -#endif // M0 > 6 -#if M0 > 7 - ARM_DOT_K0XN0(a7, b, c7); -#endif // M0 > 7 - - lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE); - - x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP; - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); - -#if defined(REINTERPRET_OUTPUT_AS_3D) - - // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += get_global_id(2) * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) -#if defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] -#if defined(MIXED_PRECISION) - CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); - ADD_BLOCK_BROADCAST(M0, c, bias_hp0); -#else // defined(MIXED_PRECISION) - ADD_BLOCK_BROADCAST(M0, c, bias0); -#endif // defined(MIXED_PRECISION) - -#else // defined(BROADCAST_BIAS) - __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id( - 2) * bias_stride_z; - - LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias -#if defined(MIXED_PRECISION) - CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp); - ADD_BLOCK(M0, c, bias_hp); -#else // defined(MIXED_PRECISION) - ADD_BLOCK(M0, c, bias); -#endif // defined(MIXED_PRECISION) - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - -#if defined(ACTIVATION_TYPE) -#if defined(MIXED_PRECISION) - ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL); -#else // defined(MIXED_PRECISION) - 
ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL); -#endif // defined(MIXED_PRECISION) -#endif // defined(ACTIVATION_TYPE) - - const bool cond_y = ((get_global_id(1) + 1) * M0 >= M); - const bool cond_x = ((get_global_id(0) + 1) * N0 >= N); - - // Store output block -#if defined(MIXED_PRECISION) - CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp); - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); -#else // defined(MIXED_PRECISION) - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); -#endif // defined(MIXED_PRECISION) - -#undef LHS_BLOCK_SIZE -#undef LHS_OFFSET_X -#undef LHS_STEP_X -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -#undef PIXEL_UNIT -#undef LHS_STEP_LOOP -#undef RHS_STEP_LOOP -} -#endif // defined(OPENCL_IMAGE_SUPPORT) - -#if defined(LHS_TRANSPOSE) - -#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE) - -#if defined(MIXED_PRECISION) - -#if(GPU_ARCH == GPU_ARCH_MIDGARD) -#define ARM_VFMA(N0, a, b, c) c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))); -#else // GPU_ARCH == GPU_ARCH_MIDGARD -#define ARM_VFMA(N0, a, b, c) c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c)); -#endif // GPU_ARCH == GPU_ARCH_MIDGARD - -#else // defined(MIXED_PRECISION - -#if(GPU_ARCH == GPU_ARCH_MIDGARD) -#define ARM_VFMA(N0, a, b, c) c += (a) * (b); -#else // GPU_ARCH == GPU_ARCH_MIDGARD -#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c)); -#endif // GPU_ARCH == GPU_ARCH_MIDGARD - -#endif // defined(MIXED_PRECISION) - -#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C) \ - ({ \ - ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); \ - }) -#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C) \ - ({ \ - ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \ - ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \ - }) -#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C) \ - ({ \ - ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C); \ - ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \ - }) -#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C) \ - ({ \ - ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C); \ - ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \ - }) -#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C) \ - ({ \ - ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C); \ - ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \ - ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \ - ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \ - ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \ - }) - -// Factory macro for the column-vector (transposed) by row-vector (not transposed) multiplication. 
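With LHS_TRANSPOSE, each K step loads an M0-long LHS column and an N0-long RHS row and performs a rank-1 update; that is what the ARM_VVM_T_NT_* ladder above expands to, one ARM_VFMA per output element. A minimal C rendering of one such update (illustrative names):

#include <math.h>
#include <stdio.h>

#define M0 4
#define N0 4

/* One ARM_MM_T_NT step with K0 = 1: C += a (column) * b (row). */
static void rank1_update(const float a[M0], const float b[N0],
                         float c[M0][N0])
{
    for (int m = 0; m < M0; ++m)     /* ARM_VVM_T_NT_M0xN0x1        */
        for (int n = 0; n < N0; ++n) /* ARM_VFMA per output element */
            c[m][n] = fmaf(a[m], b[n], c[m][n]);
}

int main(void)
{
    float a[M0] = {1, 2, 3, 4};
    float b[N0] = {1, 10, 100, 1000};
    float c[M0][N0] = {0};
    rank1_update(a, b, c);
    printf("c[2][3] = %g\n", c[2][3]); /* 3 * 1000 */
    return 0;
}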
K0 = 1 -// a is the column-vector (transposed) -// b is the row-vector (not transposed) -// C is the output matrix -// Lower case is a vector (a, b) -// Upper case is a matrix (C) -#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C) - -#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C) \ - ({ \ - ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \ - }) -#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C) \ - ({ \ - ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C); \ - ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \ - }) -#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C) \ - ({ \ - ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C); \ - ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \ - }) -#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C) \ - ({ \ - ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C); \ - ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \ - }) -#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C) \ - ({ \ - ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C); \ - ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \ - ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \ - ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \ - ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \ - }) -#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C) \ - ({ \ - ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C); \ - ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \ - ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \ - ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \ - ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##B), (B##B), C); \ - ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \ - ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \ - ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \ - ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \ - }) - -// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication. -// The dimensions for this matrix multiplications are defined through M0, N0 and K0 -// The dimensions supported are: -// M0: 1, 2, 3, 4, 8 -// N0: 1, 2, 3, 4, 8, 16 -// K0: 1, 2, 3, 4, 8, 16 -// This macro calls the vector-by-matrix macro K0 times -// A, B and C are matrices -#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \ - CONCAT(ARM_MM_T_NT_M0xN0x, K0) \ - (M0, N0, TYPE, A, B, C) - -/** This OpenCL kernel computes the matrix multiplication between 2 matrices. - * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed - * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed - * - * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE). - * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. - * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24). - * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4). - * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2) - * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. 
-DH0=2)
- * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
- * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note Only the following configurations of M0, N0 and K0 are currently supported:
- *  - M0 = 2, 3, 4, 8
- *  - N0 = 2, 3, 4, 8, 16
- *  - K0 = 2, 3, 4, 8, 16
- *  - V0 >= 1
- *  - H0 >= 1
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
- *       The activation function is performed after the bias addition
- * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
- *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
- *
- * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32
- * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
- * @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem (in bytes)
- * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
- * @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem (in bytes)
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
- * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
- * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)
- * @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem (in bytes)
- * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)
- * @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem (in bytes)
- * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix
- * @param[in] bias_ptr (Optional) Pointer to the bias matrix.
Supported data type: same as @p lhs_ptr - * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped. - * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes) - * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes) - * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs), - IMAGE_DECLARATION(rhs), -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint k, - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - // Block size -#define LHS_BLOCK_SIZE ((K0) * (M0)) - -#if defined(LHS_INTERLEAVE) -#define LHS_OFFSET_X (M0) -#define LHS_STEP_X ((M0) * (V0)) -#define LHS_STEP_LOOP (1) -#else // defined(INTERLEAVE) -#define LHS_OFFSET_X (LHS_BLOCK_SIZE) -#define LHS_STEP_X (M0) -#define LHS_STEP_LOOP (V0) -#endif // defined(INTERLEAVE) - - // Block size -#define RHS_BLOCK_SIZE ((K0) * (N0)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (N0) -#define RHS_STEP_X ((N0) * (H0)) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X (N0) -#endif // defined(RHS_INTERLEAVE) - - const uint x = get_global_id(0); - const uint y = get_global_id(1); - const uint z = get_global_id(2); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z); - - // Compute RHS matrix address - __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - 
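// (When -DMATRIX_B_DEPTH=<depth of B> is set, batch z of the LHS wraps onto plane
// (z % MATRIX_B_DEPTH) of the RHS, so a 3-dimensional matrix B is reused across the extra
// batch dimensions of matrix A, as in the batched element-wise multiplication performed
// for 2D Winograd, instead of reading past the end of B.)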
rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z; -#else // defined(MATRIX_B_DEPTH) - rhs_addr += z * rhs_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0); - - __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr); - __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr); - - for(int i = 0; i < k; i += K0) - { - VEC_DATA_TYPE(DATA_TYPE, M0) - a0; - VEC_DATA_TYPE(DATA_TYPE, N0) - b0; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - -#if K0 > 1 - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; -#endif // K0 > 1 - -#if K0 > 2 - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; -#endif // K0 > 2 - -#if K0 > 3 - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; -#endif // K0 > 3 - -#if K0 > 4 - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; -#endif // K0 > 4 - -#if K0 > 8 - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = VLOAD(N0)(0, rhs); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - rhs += RHS_STEP_X; -#endif // K0 > 8 - -#ifndef LHS_INTERLEAVE - lhs += (M0 * K0 * (V0 - 1)); -#endif // LHS_INTERLEAVE - -#ifndef RHS_INTERLEAVE - rhs += (N0 * K0 * (H0 - 1)); -#endif // RHS_INTERLEAVE - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); - -#if defined(REINTERPRET_OUTPUT_AS_3D) - - // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, 
y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
-    // multiply dst_stride_z by DEPTH_GEMM3D
-    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Add offset for batched GEMM
-    dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
-    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
-    // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
-
-    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias[broadcasted]
-#if defined(MIXED_PRECISION)
-    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
-    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
-#else  // defined(MIXED_PRECISION)
-    ADD_BLOCK_BROADCAST(M0, c, bias0);
-#endif // defined(MIXED_PRECISION)
-
-#else // defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(2) * bias_stride_z;
-
-    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-#if defined(MIXED_PRECISION)
-    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
-    ADD_BLOCK(M0, c, bias_hp);
-#else  // defined(MIXED_PRECISION)
-    ADD_BLOCK(M0, c, bias);
-#endif // defined(MIXED_PRECISION)
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
-#if defined(MIXED_PRECISION)
-    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL);
-#else  // defined(MIXED_PRECISION)
-    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
-#endif // defined(MIXED_PRECISION)
-#endif // defined(ACTIVATION_TYPE)
-
-    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
-    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
-
-    // Store output block
-#if defined(MIXED_PRECISION)
-    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
-    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-#else  // defined(MIXED_PRECISION)
-    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-#endif // defined(MIXED_PRECISION)
-
-#undef LHS_BLOCK_SIZE
-#undef LHS_OFFSET_X
-#undef LHS_STEP_X
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-}
-
-#if defined(OPENCL_IMAGE_SUPPORT)
-/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in an OpenCL image object.
- * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
- * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
- *
- * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
- * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
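As a concrete illustration of how the compile-time switches listed in these notes fit together, here is a minimal host-side sketch for building one configuration of these reshaped kernels. The numeric values are the illustrative ones from the notes above, not a tuned configuration; the program and device handles are assumed to have been created beforehand, and the image-based variant documented here additionally needs -DOPENCL_IMAGE_SUPPORT and -DRHS_HEIGHT.

#include <CL/cl.h>

/* Illustrative build options only; a real integration derives M/N/K and the
 * block sizes M0/N0/K0/V0/H0 from the tensor shapes and the target GPU. */
static const char *gemm_reshaped_build_options =
    "-DM=52 -DN=90 -DK=24 "
    "-DM0=4 -DN0=8 -DK0=4 -DV0=2 -DH0=2 "
    "-DDATA_TYPE=float -DDATA_TYPE_ACCUMULATOR=float "
    "-DPARTIAL_STORE_M0=1 -DPARTIAL_STORE_N0=1 "
    "-DLHS_TRANSPOSE";

static cl_int build_gemm_reshaped(cl_program program, cl_device_id device)
{
    /* program: created from this .cl source; device: the target OpenCL device */
    return clBuildProgram(program, 1, &device, gemm_reshaped_build_options, NULL, NULL);
}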
- * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
- * @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).
- * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT= (e.g. -DRHS_HEIGHT=32)
- *       Since we cannot create a 3D image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
- *       could be different from the value returned by get_image_height(rhs_img).
- * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
- * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
- * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
- * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
- * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note Only the following configurations of M0, N0 and K0 are currently supported:
- *  - M0 = 2, 3, 4, 8
- *  - N0 = 4, 8, 16
- *  - K0 = 4, 8, 16
- *  - V0 >= 1
- *  - H0 >= 1
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
- *       The activation function is performed after the bias addition
- * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
- *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
- *
- * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F32
- * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)
- * @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)
- * @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix
- * @param[in] rhs_img The RHS reshaped matrix as cl_image 2d. Supported data type: same as @p lhs_ptr
- * @param[in] bias_ptr (Optional) Pointer to the bias matrix.
Supported data type: same as @p lhs_ptr - * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped. - * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes) - * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes) - * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs), - __read_only image2d_t rhs_img, -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint k, - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - // Pixel unit -#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0) - - // Block size -#define LHS_BLOCK_SIZE ((K0) * (M0)) - -#if defined(LHS_INTERLEAVE) -#define LHS_OFFSET_X (M0) -#define LHS_STEP_X ((M0) * (V0)) -#define LHS_STEP_LOOP (1) -#else // defined(INTERLEAVE) -#define LHS_OFFSET_X (LHS_BLOCK_SIZE) -#define LHS_STEP_X (M0) -#define LHS_STEP_LOOP (V0) -#endif // defined(INTERLEAVE) - - // Block size -#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (PIXEL_UNIT) -#define RHS_STEP_X ((PIXEL_UNIT) * (H0)) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X (PIXEL_UNIT) -#endif // defined(RHS_INTERLEAVE) - - const uint x = get_global_id(0); - const uint y = get_global_id(1); - const uint z = get_global_id(2); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z); - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - const uint z_rhs = (z % MATRIX_B_DEPTH); -#else // defined(MATRIX_B_DEPTH) - 
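// (Image addressing note: PIXEL_UNIT is N0 converted from elements to texels, assuming the
// four-element RGBA pixel packing normally used for these images, and the batch dimension of
// the reshaped RHS is folded into the image height; this is why y_rhs below is computed as
// (x / H0) + z_rhs * RHS_HEIGHT, matching the RHS_HEIGHT note above.)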
const uint z_rhs = z; -#endif // defined(MATRIX_B_DEPTH) - - // Compute RHS matrix coordinates - uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X; - const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT; - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0); - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0); - - __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr); - - for(int i = 0; i < K; i += K0) - { - VEC_DATA_TYPE(DATA_TYPE, M0) - a0; - VEC_DATA_TYPE(DATA_TYPE, N0) - b0; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - -#if K0 > 1 - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; -#endif // K0 > 1 - -#if K0 > 2 - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; -#endif // K0 > 2 - -#if K0 > 3 - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; -#endif // K0 > 3 - -#if K0 > 4 - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; -#endif // K0 > 4 - -#if K0 > 8 - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs)); - - ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c); - - lhs += LHS_STEP_X; - - a0 = VLOAD(M0)(0, lhs); - b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, 
rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
-
-        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
-
-        lhs += LHS_STEP_X;
-#endif // K0 > 8
-
-#ifndef LHS_INTERLEAVE
-        lhs += (M0 * K0 * (V0 - 1));
-#endif // LHS_INTERLEAVE
-
-        x_rhs += K0 * RHS_STEP_X;
-#ifndef RHS_INTERLEAVE
-        x_rhs += (PIXEL_UNIT * K0 * (H0 - 1));
-#endif // RHS_INTERLEAVE
-    }
-
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
-
-    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
-
-#if defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
-    CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
-    // multiply dst_stride_z by DEPTH_GEMM3D
-    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Add offset for batched GEMM
-    dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
-    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
-    // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
-
-    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias[broadcasted]
-#if defined(MIXED_PRECISION)
-    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
-    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
-#else  // defined(MIXED_PRECISION)
-    ADD_BLOCK_BROADCAST(M0, c, bias0);
-#endif // defined(MIXED_PRECISION)
-
-#else // defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;
-
-    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-#if defined(MIXED_PRECISION)
-    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
-    ADD_BLOCK(M0, c, bias_hp);
-#else  // defined(MIXED_PRECISION)
-    ADD_BLOCK(M0, c, bias);
-#endif // defined(MIXED_PRECISION)
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
-#if defined(MIXED_PRECISION)
-    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, VEC_SIZE, c, A_VAL, B_VAL);
-#else  // defined(MIXED_PRECISION)
-    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
-#endif // defined(MIXED_PRECISION)
-#endif // defined(ACTIVATION_TYPE)
-
-    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
-    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);
-
-    // Store output block
-#if defined(MIXED_PRECISION)
-    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
-    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-#else  // defined(MIXED_PRECISION)
-    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-#endif // defined(MIXED_PRECISION)
-
-#undef LHS_BLOCK_SIZE
-#undef LHS_OFFSET_X
-#undef LHS_STEP_X
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X -#undef PIXEL_UNIT -#undef LHS_STEP_LOOP -#undef RHS_STEP_LOOP -} -#endif // defined(OPENCL_IMAGE_SUPPORT) - -#endif // defined(LHS_TRANSPOSE) - -#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) && defined(DATA_TYPE) - -#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE) - -#define VFMA(a, b, c) \ - ({ \ - c = fma(a, b, c); \ - }) - -#if M0 == 1 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - }) -#elif M0 == 2 // M0 == 2 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - }) -#elif M0 == 3 // M0 == 3 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - }) -#elif M0 == 4 // M0 == 4 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - }) -#elif M0 == 5 // M0 == 5 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - }) -#elif M0 == 6 // M0 == 6 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ - }) -#elif M0 == 7 // M0 == 7 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ - }) -#elif M0 == 8 // M0 == 8 -#define RHS_VFMA_M0xN0(i, a, b, c) \ - ({ \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \ - VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \ - }) -#else // M0 not supported -#error "M0 not supported" -#endif // M0 not supported - -/** This OpenCL kernel computes the matrix multiplication between 2 
matrices.
- * The LHS matrix is NOT reshaped
- * The RHS matrix is NOT reshaped
- *
- * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
- * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)
- * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)
- * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
- * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g. -DK0=2)
- * @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. -DN0=2)
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note Only the following configurations of M0, N0 and K0 are currently supported:
- *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
- *  - N0 = 2, 3, 4, 8, 16
- *  - K0 = 2, 3, 4, 8, 16
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
- *       The activation function is performed after the bias addition
- * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
- *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
- *
- * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32
- * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
- * @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
- * @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
- * @param[in] rhs_ptr Pointer to the RHS matrix. Supported data type: same as @p lhs_ptr
- * @param[in] rhs_stride_x Stride of the RHS matrix in X dimension (in bytes)
- * @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] rhs_stride_y Stride of the RHS matrix in Y dimension (in bytes)
- * @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS matrix
- * @param[in] bias_ptr (Optional) Pointer to the bias matrix.
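For reference, the unrolled RHS_VFMA_M0xN0 calls in the kernel below accumulate one M0 x N0 output block over a K0-deep slice: call i broadcasts element i of each of the M0 rows of A against one N0-wide row of B. A plain-C sketch of the same arithmetic, under illustrative block sizes (reference semantics only, not the literal macro expansion):

#include <math.h>

#define M0 4
#define N0 4
#define K0 4

/* c += a * b for an M0xK0 block of A and a K0xN0 block of B (one K0-step of the main loop). */
static void native_block_ref(const float a[M0][K0], const float b[K0][N0], float c[M0][N0])
{
    for (int k = 0; k < K0; ++k)         /* one RHS_VFMA_M0xN0(k, a, b<k>, c) per k */
        for (int m = 0; m < M0; ++m)     /* broadcast element k of row m of A ...   */
            for (int n = 0; n < N0; ++n) /* ... against the N0-wide row k of B      */
                c[m][n] = fmaf(a[m][k], b[k][n], c[m][n]);
}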
Supported data type: same as @p lhs_ptr - * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes) - * @param[in] rhs_stride_z Stride of the RHS matrix in Z dimension (in bytes) - * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D) - * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs), - IMAGE_DECLARATION(rhs), -#if defined(BETA) - IMAGE_DECLARATION(bias), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint lhs_stride_z, - uint rhs_stride_z, -#if defined(BETA) - uint bias_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint lhs_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - // Block size -#define RHS_BLOCK_SIZE ((K0) * (N0)) - - // RHS offset and step X -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) - - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; - - // Compute RHS matrix address - uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE); - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; -#else // defined(MATRIX_B_DEPTH) - rhs_offset += z * rhs_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); - REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); - -#if defined(REINTERPRET_INPUT_AS_3D) - // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); - - // Add offset 
for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply lhs_stride_z by DEPTH_GEMM3D - lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - lhs_offset += z * lhs_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0; - - int i = 0; - for(; i <= (K - K0); i += K0) - { - // Supported cases (M0, K0): - // 1,2 - 1,3 - 1,4 - 1,8 - 1,16 - // 2,2 - 2,3 - 2,4 - 2,8 - 2,16 - // 3,2 - 3,3 - 3,4 - 3,8 - 3,16 - // 4,2 - 4,3 - 4,4 - 4,8 - 4,16 - // 5,2 - 5,3 - 5,4 - 5,8 - 5,16 - // 6,2 - 6,3 - 6,4 - 6,8 - 6,16 - // 7,2 - 7,3 - 7,4 - 7,8 - 7,16 - // 8,2 - 8,3 - 8,4 - 8,8 - 8,16 - // Load values from LHS matrix - LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); - - // Load values from RHS matrix - LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero); - - RHS_VFMA_M0xN0(0, a, b0, c); - RHS_VFMA_M0xN0(1, a, b1, c); -#if K0 > 2 - RHS_VFMA_M0xN0(2, a, b2, c); -#endif // K0 > 2 -#if K0 > 3 - RHS_VFMA_M0xN0(3, a, b3, c); -#endif // K0 > 3 -#if K0 > 4 - RHS_VFMA_M0xN0(4, a, b4, c); - RHS_VFMA_M0xN0(5, a, b5, c); - RHS_VFMA_M0xN0(6, a, b6, c); - RHS_VFMA_M0xN0(7, a, b7, c); -#endif // K0 > 4 -#if K0 > 8 - RHS_VFMA_M0xN0(8, a, b8, c); - RHS_VFMA_M0xN0(9, a, b9, c); - RHS_VFMA_M0xN0(A, a, bA, c); - RHS_VFMA_M0xN0(B, a, bB, c); - RHS_VFMA_M0xN0(C, a, bC, c); - RHS_VFMA_M0xN0(D, a, bD, c); - RHS_VFMA_M0xN0(E, a, bE, c); - RHS_VFMA_M0xN0(F, a, bF, c); -#endif // K0 > 8 - - lhs_offset += K0 * sizeof(DATA_TYPE); - rhs_offset += K0 * rhs_stride_y; - } - - // Left-over accumulations - for(; i < K; ++i) - { - // Load values from LHS matrix - VEC_DATA_TYPE(DATA_TYPE, 2) - a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0)); -#if M0 > 1 - VEC_DATA_TYPE(DATA_TYPE, 2) - a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1)); -#endif // M0 > 1 -#if M0 > 2 - VEC_DATA_TYPE(DATA_TYPE, 2) - a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2)); -#endif // M0 > 2 -#if M0 > 3 - VEC_DATA_TYPE(DATA_TYPE, 2) - a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3)); -#endif // M0 > 3 -#if M0 > 4 - VEC_DATA_TYPE(DATA_TYPE, 2) - a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4)); -#endif // M0 > 4 -#if M0 > 5 - VEC_DATA_TYPE(DATA_TYPE, 2) - a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5)); -#endif // M0 > 5 -#if M0 > 6 - VEC_DATA_TYPE(DATA_TYPE, 2) - a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6)); -#endif // M0 > 6 -#if M0 > 7 - VEC_DATA_TYPE(DATA_TYPE, 2) - a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7)); -#endif // M0 > 7 - - VEC_DATA_TYPE(DATA_TYPE, N0) - b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y)); - RHS_VFMA_M0xN0(0, a, b, c); - - lhs_offset += sizeof(DATA_TYPE); - rhs_offset += rhs_stride_y; - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); - -#if defined(REINTERPRET_OUTPUT_AS_3D) - // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - 
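// (CALCULATE_Z_OFFSET presumably expands to the arithmetic written out inline in the
// interleaved/transposed kernels further down in this patch: plane = min(DEPTH_GEMM3D - 1,
// row_index / HEIGHT_GEMM3D), scaled by cross_plane_pad * stride_y, so each 2D plane written
// into the reinterpreted 3D tensor skips the bottom padding rows between planes.)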
CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
-
-    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
-    // multiply dst_stride_z by DEPTH_GEMM3D
-    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Add offset for batched GEMM
-    dst_addr += z * dst_stride_z;
-
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
-    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
-#endif // defined(ALPHA)
-
-    // Add beta*bias
-#if defined(BETA)
-#if defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
-
-    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias[broadcasted]
-    ADD_BLOCK_BROADCAST(M0, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
-    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;
-
-    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias
-    ADD_BLOCK(M0, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
-    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, c, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
-    const bool cond_y = y == 0;
-    const bool cond_x = ((x + 1) * N0 >= N);
-
-    // Store output block
-    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-
-#undef RHS_BLOCK_SIZE
-#undef RHS_OFFSET_X
-#undef RHS_STEP_X
-}
-#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
-
-#if defined(BETA)
-/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
- *
- * @note The beta's value needs to be passed at compile time using -DBETA
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: F32
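In scalar terms, the two kernels below compute dst = dst + BETA * src element-wise, where dst already holds alpha * (A x B) and src is the matrix C being added. A minimal plain-C reference (BETA value illustrative; the kernels receive it via -DBETA):

#define BETA 0.5f /* illustrative; passed as -DBETA in the OpenCL build options */

/* dst holds alpha*(A*B); src is matrix C; n is the number of elements. */
static void gemm_ma_ref(const float *src, float *dst, int n)
{
    for (int i = 0; i < n; ++i)
        dst[i] += BETA * src[i];
}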
- * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src),
-                          TENSOR3D_DECLARATION(dst))
-{
-    // Compute source and destination addresses
-    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
-    // Load values from A x B
-    float4 alpha_ab = vload4(0, (__global float *)dst.ptr);
-
-    // Load values from Matrix C
-    float4 c = vload4(0, (__global float *)src.ptr);
-
-    // Computes alpha * axb + beta * c
-    float4 out = alpha_ab + (float4)BETA * c;
-
-    // Store final result in axb matrix
-    vstore4(out, 0, (__global float *)dst.ptr);
-}
-
-#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
-/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
- *
- * @note The beta's value needs to be passed at compile time using -DBETA
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: F16
- * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source matrix in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),
-                          TENSOR3D_DECLARATION(dst))
-{
-    // Compute source and destination addresses
-    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
-    // Load values from A x B
-    half8 alpha_ab = vload8(0, (__global half *)dst.ptr);
-
-    // Load values from Matrix C
-    half8 c = vload8(0, (__global half *)src.ptr);
-
-    // Computes alpha * axb + beta * c
-    half8 out = alpha_ab + (half8)BETA * c;
-
-    // Store final result in axb matrix
-    vstore8(out, 0, (__global half *)dst.ptr);
-}
-#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
-#endif // defined(BETA)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/gemm_v1.cl b/src/core/CL/cl_kernels/gemm_v1.cl
deleted file mode 100644
index a136a1b96b..0000000000
--- a/src/core/CL/cl_kernels/gemm_v1.cl
+++ /dev/null
@@ -1,3243 +0,0 @@
-/*
- * Copyright (c) 2020-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "gemm_helpers.h"
-#include "repeat.h"
-
-#if defined(M) && defined(N) && defined(K) && defined(H0) && defined(V0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) && defined(IN1_DIM_X)
-/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
- *
- * @note The number of rows of destination matrix must be passed at compile time using -DM
- * @note The number of columns of the destination matrix must be passed at compile time using -DN
- * @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK
- * @note The number of columns of the reshaped rhs matrix must be passed at compile time using -DIN1_DIM_X
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note The optional alpha's value needs to be passed at compile time using -DALPHA
- * @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2)
- * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2)
- * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
- *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
- *       The activation function is performed after the bias addition
- * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
- *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
- *
- * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32
- * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in] src1_ptr Pointer to the source matrix.
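To make the interleaved addressing in the kernel body concrete: with H0 = 2, the work-item with get_global_id(0) = 5 reads reshaped-RHS row x = 5 / 2 = 2 and starts at column offset_row_b = (5 % 2) * 4 = 4, i.e. the second 4-wide transposed block interleaved into that row; V0 and offset_row_a play the same roles for the interleaved LHS.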
Supported data types: same as @p src0_ptr - * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr - * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0), - IMAGE_DECLARATION(src1), -#if defined(BETA) - IMAGE_DECLARATION(src2), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint src0_stride_z, - uint src1_stride_z, -#if defined(BETA) - uint src2_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - int x = get_global_id(0) / H0; - int y = get_global_id(1) / V0; - int z = get_global_id(2); - - // Offset - const int offset_row_a = (get_global_id(1) % V0) * 4; - const int offset_row_b = (get_global_id(0) % H0) * 4; - - // src_addr_a = address of matrix A - // src_addr_b = address of matrix B - int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; - int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; -#else // defined(MATRIX_B_DEPTH) - src1_addr_in_bytes += z * src1_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes); - __global float *src_addr_b = (__global float *)(src1_ptr + 
src1_addr_in_bytes); - - // Compute end row address for matrix B - __global float *src_end_addr_b = src_addr_b + IN1_DIM_X; - - src_addr_a += offset_row_a; - src_addr_b += offset_row_b; - - // Reset accumulators - float4 c0 = 0.0f; - float4 c1 = 0.0f; - float4 c2 = 0.0f; - float4 c3 = 0.0f; - - for(; src_addr_b <= (src_end_addr_b - (int)(8 * H0)); src_addr_a += 8 * V0, src_addr_b += 8 * H0) - { - // Load values from matrix A (interleaved) and matrix B (transposed) - float4 a0 = vload4(0, src_addr_a); - float4 b0 = vload4(0, src_addr_b); - - c0 += (float4)a0.s0 * b0; - c1 += (float4)a0.s1 * b0; - c2 += (float4)a0.s2 * b0; - c3 += (float4)a0.s3 * b0; - - // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = vload4(0, src_addr_a + 4 * V0); - b0 = vload4(0, src_addr_b + 4 * H0); - - c0 += (float4)a0.s0 * b0; - c1 += (float4)a0.s1 * b0; - c2 += (float4)a0.s2 * b0; - c3 += (float4)a0.s3 * b0; - } - - for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * V0, src_addr_b += 4 * H0) - { - // Load values from matrix A (interleaved) and matrix B (transposed) - float4 a0 = vload4(0, src_addr_a); - float4 b0 = vload4(0, src_addr_b); - - c0 += (float4)a0.s0 * b0; - c1 += (float4)a0.s1 * b0; - c2 += (float4)a0.s2 * b0; - c3 += (float4)a0.s3 * b0; - } - - // Compute destination address - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Compute dst address - __global uchar *dst_addr = offset(&dst, 0, 0); - - uint4 zout = 0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D - zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D; - zout = min(DEPTH_GEMM3D - 1, zout); - - // Add offset due to the cross plane paddings - zout *= (cross_plane_pad * dst_stride_y); - - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we
-    // multiply dst_stride_z by DEPTH_GEMM3D
-    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-    // Add offset for batched GEMM
-    dst_addr += z * dst_stride_z;
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
-    SCALE_BLOCK(4, float, c, ALPHA);
-#endif // defined(ALPHA)
-
-    // Add beta*bias
-#if defined(BETA)
-    REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
-
-#if defined(BROADCAST_BIAS)
-    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
-
-    LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(1, float, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias[broadcasted]
-    ADD_BLOCK_BROADCAST(4, c, bias0);
-
-#else // defined(BROADCAST_BIAS)
-    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z;
-
-    LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(4, float, bias, BETA);
-#endif // UNIT_BETA
-
-    // c = c + bias
-    ADD_BLOCK(4, c, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
-    ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, VEC_SIZE, c, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
-    // Store 4x4 block
-    const bool cond_y = ((get_global_id(1) + 1) * 4 >= M);
-    const bool cond_x = ((get_global_id(0) + 1) * 4 >= N);
-    STORE_BLOCK_BOUNDARY_AWARE(4, 4, float, c, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-}
-
-/** This OpenCL kernel is optimized for Bifrost and it computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)
- *
- * @note The number of rows of destination matrix must be passed at compile time using -DM
- * @note The number of columns of the destination matrix must be passed at compile time using -DN
- * @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note The optional alpha's value needs to be passed at compile time using -DALPHA
- * @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2)
- * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2)
- * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
- *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
-DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. - * The activation function is performed after the bias addition - * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time: - * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D - * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. - * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped - * - * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32 - * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr - * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
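As a concrete illustration of the compile-time contract listed in the notes above, the host side might assemble the build options along these lines. This is a sketch only: the -D values are arbitrary placeholders, and program/device are assumed to be a valid cl_program/cl_device_id created elsewhere.

    #include <CL/cl.h>

    /* Hypothetical build step for this kernel family; IN1_DIM_X is required
     * by the non-Bifrost variants guarded in this file. */
    cl_int build_gemm_v1(cl_program program, cl_device_id device)
    {
        const char *opts = "-DM=64 -DN=128 -DK=256 -DH0=4 -DV0=4 -DIN1_DIM_X=512 "
                           "-DPARTIAL_STORE_M0=1 -DPARTIAL_STORE_N0=1 -DALPHA=0.5f";
        return clBuildProgram(program, 1, &device, opts, NULL, NULL);
    }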
Supported data type: same as @p lhs_ptr - * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0), - IMAGE_DECLARATION(src1), -#if defined(BETA) - IMAGE_DECLARATION(src2), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint src0_stride_z, - uint src1_stride_z, -#if defined(BETA) - uint src2_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - int x = get_global_id(0) / H0; - int y = get_global_id(1) / V0; - int z = get_global_id(2); - - // Offset - const int offset_row_a = (get_global_id(1) % V0) * 4; - const int offset_row_b = (get_global_id(0) % H0) * 4; - - // src_addr_a = address of matrix A - // src_addr_b = address of matrix B - int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; - int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; -#else // defined(MATRIX_B_DEPTH) - src1_addr_in_bytes += z * src1_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes); - __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes); - - src_addr_a += offset_row_a; - src_addr_b += offset_row_b; - - // Reset accumulators - float4 c0 = 0.0f; - float4 c1 = 0.0f; - float4 c2 = 0.0f; - float4 c3 = 0.0f; - - int i = 0; - for(; i <= (int)(K - 4); i += 4) - { - // Load values from matrix A (interleaved) and matrix B (transposed) - float4 a0 = vload4(0, src_addr_a); - float4 b0 = vload4(0, src_addr_b); - - src_addr_a += 4 * V0; - src_addr_b += 4 * H0; - - c0.s0 = fma(a0.s0, b0.s0, c0.s0); - c0.s1 = fma(a0.s0, b0.s1, c0.s1); - c0.s2 = fma(a0.s0, b0.s2, c0.s2); - c0.s3 = fma(a0.s0, 
b0.s3, c0.s3); - - c1.s0 = fma(a0.s1, b0.s0, c1.s0); - c1.s1 = fma(a0.s1, b0.s1, c1.s1); - c1.s2 = fma(a0.s1, b0.s2, c1.s2); - c1.s3 = fma(a0.s1, b0.s3, c1.s3); - - c2.s0 = fma(a0.s2, b0.s0, c2.s0); - c2.s1 = fma(a0.s2, b0.s1, c2.s1); - c2.s2 = fma(a0.s2, b0.s2, c2.s2); - c2.s3 = fma(a0.s2, b0.s3, c2.s3); - - c3.s0 = fma(a0.s3, b0.s0, c3.s0); - c3.s1 = fma(a0.s3, b0.s1, c3.s1); - c3.s2 = fma(a0.s3, b0.s2, c3.s2); - c3.s3 = fma(a0.s3, b0.s3, c3.s3); - - // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = vload4(0, src_addr_a); - b0 = vload4(0, src_addr_b); - - src_addr_a += 4 * V0; - src_addr_b += 4 * H0; - - c0.s0 = fma(a0.s0, b0.s0, c0.s0); - c0.s1 = fma(a0.s0, b0.s1, c0.s1); - c0.s2 = fma(a0.s0, b0.s2, c0.s2); - c0.s3 = fma(a0.s0, b0.s3, c0.s3); - - c1.s0 = fma(a0.s1, b0.s0, c1.s0); - c1.s1 = fma(a0.s1, b0.s1, c1.s1); - c1.s2 = fma(a0.s1, b0.s2, c1.s2); - c1.s3 = fma(a0.s1, b0.s3, c1.s3); - - c2.s0 = fma(a0.s2, b0.s0, c2.s0); - c2.s1 = fma(a0.s2, b0.s1, c2.s1); - c2.s2 = fma(a0.s2, b0.s2, c2.s2); - c2.s3 = fma(a0.s2, b0.s3, c2.s3); - - c3.s0 = fma(a0.s3, b0.s0, c3.s0); - c3.s1 = fma(a0.s3, b0.s1, c3.s1); - c3.s2 = fma(a0.s3, b0.s2, c3.s2); - c3.s3 = fma(a0.s3, b0.s3, c3.s3); - - // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = vload4(0, src_addr_a); - b0 = vload4(0, src_addr_b); - - src_addr_a += 4 * V0; - src_addr_b += 4 * H0; - - c0.s0 = fma(a0.s0, b0.s0, c0.s0); - c0.s1 = fma(a0.s0, b0.s1, c0.s1); - c0.s2 = fma(a0.s0, b0.s2, c0.s2); - c0.s3 = fma(a0.s0, b0.s3, c0.s3); - - c1.s0 = fma(a0.s1, b0.s0, c1.s0); - c1.s1 = fma(a0.s1, b0.s1, c1.s1); - c1.s2 = fma(a0.s1, b0.s2, c1.s2); - c1.s3 = fma(a0.s1, b0.s3, c1.s3); - - c2.s0 = fma(a0.s2, b0.s0, c2.s0); - c2.s1 = fma(a0.s2, b0.s1, c2.s1); - c2.s2 = fma(a0.s2, b0.s2, c2.s2); - c2.s3 = fma(a0.s2, b0.s3, c2.s3); - - c3.s0 = fma(a0.s3, b0.s0, c3.s0); - c3.s1 = fma(a0.s3, b0.s1, c3.s1); - c3.s2 = fma(a0.s3, b0.s2, c3.s2); - c3.s3 = fma(a0.s3, b0.s3, c3.s3); - - // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = vload4(0, src_addr_a); - b0 = vload4(0, src_addr_b); - - src_addr_a += 4 * V0; - src_addr_b += 4 * H0; - - c0.s0 = fma(a0.s0, b0.s0, c0.s0); - c0.s1 = fma(a0.s0, b0.s1, c0.s1); - c0.s2 = fma(a0.s0, b0.s2, c0.s2); - c0.s3 = fma(a0.s0, b0.s3, c0.s3); - - c1.s0 = fma(a0.s1, b0.s0, c1.s0); - c1.s1 = fma(a0.s1, b0.s1, c1.s1); - c1.s2 = fma(a0.s1, b0.s2, c1.s2); - c1.s3 = fma(a0.s1, b0.s3, c1.s3); - - c2.s0 = fma(a0.s2, b0.s0, c2.s0); - c2.s1 = fma(a0.s2, b0.s1, c2.s1); - c2.s2 = fma(a0.s2, b0.s2, c2.s2); - c2.s3 = fma(a0.s2, b0.s3, c2.s3); - - c3.s0 = fma(a0.s3, b0.s0, c3.s0); - c3.s1 = fma(a0.s3, b0.s1, c3.s1); - c3.s2 = fma(a0.s3, b0.s2, c3.s2); - c3.s3 = fma(a0.s3, b0.s3, c3.s3); - } - - for(; i < (int)K; ++i) - { - // Load values from matrix A (interleaved) and matrix B (transposed) - float4 a0 = vload4(0, src_addr_a); - float4 b0 = vload4(0, src_addr_b); - - src_addr_a += 4 * V0; - src_addr_b += 4 * H0; - - c0.s0 = fma(a0.s0, b0.s0, c0.s0); - c0.s1 = fma(a0.s0, b0.s1, c0.s1); - c0.s2 = fma(a0.s0, b0.s2, c0.s2); - c0.s3 = fma(a0.s0, b0.s3, c0.s3); - - c1.s0 = fma(a0.s1, b0.s0, c1.s0); - c1.s1 = fma(a0.s1, b0.s1, c1.s1); - c1.s2 = fma(a0.s1, b0.s2, c1.s2); - c1.s3 = fma(a0.s1, b0.s3, c1.s3); - - c2.s0 = fma(a0.s2, b0.s0, c2.s0); - c2.s1 = fma(a0.s2, b0.s1, c2.s1); - c2.s2 = fma(a0.s2, b0.s2, c2.s2); - c2.s3 = fma(a0.s2, b0.s3, c2.s3); - - c3.s0 = fma(a0.s3, b0.s0, c3.s0); - c3.s1 = fma(a0.s3, b0.s1, c3.s1); - c3.s2 = fma(a0.s3, b0.s2, c3.s2); - c3.s3 = fma(a0.s3, 
b0.s3, c3.s3); - } - - // Compute destination address - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Compute dst address - __global uchar *dst_addr = offset(&dst, 0, 0); - - uint4 zout = 0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D - zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D; - zout = min(DEPTH_GEMM3D - 1, zout); - - // Add offset due to the cross plane paddings - zout *= (cross_plane_pad * dst_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; -#else // defined(REINTERPRET_OUTPUT_AS_3D) - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(4, float, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) - REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0); - -#if defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)); - - LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, float, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(4, c, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id( - 2) * src2_stride_z; - - LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(4, float, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias - ADD_BLOCK(4, c, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - -#if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, VEC_SIZE, c, A_VAL, B_VAL); -#endif // defined(ACTIVATION_TYPE) - - // Store 4x4 block - const bool cond_y = ((get_global_id(1) + 1) * 4 >= M); - const bool cond_x = ((get_global_id(0) + 1) * 4 >= N); - STORE_BLOCK_BOUNDARY_AWARE(4, 4, float, c, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); -} - -#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) -/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1) - * - * @note The number of rows of destination matrix must be passed at compile time using -DM - * @note The number of columns of the destination matrix must be passed at compile time using -DN - * @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK - * @note The number of columns of the reshaped rhs matrix must be passed at compile time using -DIN1_DIM_X - * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. 
-DPARTIAL_STORE_M0=1) - * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) - * @note The optional alpha's value need to be passed at compile time using -DALPHA - * @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2) - * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2) - * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) - * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) - * - * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. - * The activation function is performed after the bias addition - * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time: - * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D - * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. - * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped - * - * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16 - * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr - * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
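The MATRIX_B_DEPTH note above corresponds to a small batch-index computation repeated in every kernel body in this file; in isolation it behaves as in this sketch (OpenCL C, reusing the kernels' own variable names):

    /* With -DMATRIX_B_DEPTH=16 (batched 2D Winograd GEMM), A may have four
     * dimensions while B has three: B's batch index wraps so that every
     * group of A batches reuses the same 16 B slices instead of sliding
     * out of bounds. */
    uint src1_batch_offset(uint z, uint src1_stride_z)
    {
    #if defined(MATRIX_B_DEPTH)
        return (z % MATRIX_B_DEPTH) * src1_stride_z; /* do not slide B */
    #else  /* defined(MATRIX_B_DEPTH) */
        return z * src1_stride_z;                    /* one B slice per batch */
    #endif /* defined(MATRIX_B_DEPTH) */
    }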
Supported data type: same as @p lhs_ptr - * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0), - IMAGE_DECLARATION(src1), -#if defined(BETA) - IMAGE_DECLARATION(src2), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint src0_stride_z, - uint src1_stride_z, -#if defined(BETA) - uint src2_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - int x = get_global_id(0) / H0; - int y = get_global_id(1) / V0; - int z = get_global_id(2); - - // Offset - const int offset_row_a = (get_global_id(1) % V0) * 4; - const int offset_row_b = (get_global_id(0) % H0) * 8; - - // src_addr_a = address of matrix A - // src_addr_b = address of matrix B - int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; - int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; -#else // defined(MATRIX_B_DEPTH) - src1_addr_in_bytes += z * src1_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes); - __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes); - - // Compute end row address for matrix B - __global half *src_end_addr_b = src_addr_b + IN1_DIM_X; - - src_addr_a += offset_row_a; - src_addr_b += offset_row_b; - - // Reset accumulators - half8 c0 = 0.0f; - half8 c1 = 0.0f; - half8 c2 = 0.0f; - half8 c3 = 0.0f; - - for(; src_addr_b <= (src_end_addr_b - (int)(16 * H0)); src_addr_a += 8 * V0, src_addr_b += 16 * H0) - { - // Load values from matrix A (interleaved) and matrix B (transposed) - half4 a0 = vload4(0, src_addr_a); - half8 b0 = vload8(0, src_addr_b); - - c0 += (half8)a0.s0 * b0; - c1 += 
(half8)a0.s1 * b0; - c2 += (half8)a0.s2 * b0; - c3 += (half8)a0.s3 * b0; - - // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = vload4(0, src_addr_a + 4 * V0); - b0 = vload8(0, src_addr_b + 8 * H0); - - c0 += (half8)a0.s0 * b0; - c1 += (half8)a0.s1 * b0; - c2 += (half8)a0.s2 * b0; - c3 += (half8)a0.s3 * b0; - } - - for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * V0, src_addr_b += 8 * H0) - { - // Load values from matrix A (interleaved) and matrix B (transposed) - half4 a0 = vload4(0, src_addr_a); - half8 b0 = vload8(0, src_addr_b); - - c0 += (half8)a0.s0 * b0; - c1 += (half8)a0.s1 * b0; - c2 += (half8)a0.s2 * b0; - c3 += (half8)a0.s3 * b0; - } - - // Compute destination address - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Compute dst address - __global uchar *dst_addr = offset(&dst, 0, 0); - - uint4 zout = 0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D - zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D; - zout = min(DEPTH_GEMM3D - 1, zout); - - // Add offset due to the cross plane paddings - zout *= (cross_plane_pad * dst_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; -#else // defined(REINTERPRET_OUTPUT_AS_3D) - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(4, half, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) - REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0); - -#if defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)); - - LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, half, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(4, c, bias0); - -#else // defined(BROADCAST_BIAS) - - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id( - 2) * src2_stride_z; - - LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(4, half, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias - ADD_BLOCK(4, c, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - -#if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, VEC_SIZE, c, A_VAL, B_VAL); -#endif // defined(ACTIVATION_TYPE) - - // Store 4x8 block - const bool cond_y = ((get_global_id(1) + 1) * 4 >= M); - const bool cond_x = ((get_global_id(0) + 1) * 8 >= N); - STORE_BLOCK_BOUNDARY_AWARE(4, 8, half, c, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); -} - -/** This OpenCL kernel computes the matrix multiplication between matrix A 
reshaped (src0) and matrix B reshaped (src1) while accumulating the result in a 32-bit floating point variable. - * - * @note The number of rows of destination matrix must be passed at compile time using -DM - * @note The number of columns of the destination matrix must be passed at compile time using -DN - * @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK - * @note The number of columns of the reshaped rhs matrix must be passed at compile time using -DIN1_DIM_X - * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) - * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) - * @note The optional alpha's value needs to be passed at compile time using -DALPHA - * @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2) - * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2) - * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) - * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) - * - * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. - * The activation function is performed after the bias addition - * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time: - * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D - * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. - * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped - * - * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16 - * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src1_ptr Pointer to the source matrix.
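Before the parameter list continues, it is worth spelling out the two bias paths shared by every kernel in this family; schematically (a sketch for the 4x8 F16 tile shape, with hypothetical plain-array arguments):

    /* BETA path in scalar form: bias is scaled by BETA (skipped when
     * UNIT_BETA is defined) and then added to the accumulators. */
    void add_bias_4x8_model(float c[4][8], const float bias_row[8],
                            const float bias_tile[4][8], float beta, int broadcast)
    {
        for(int m = 0; m < 4; ++m)
        {
            for(int n = 0; n < 8; ++n)
            {
                /* broadcast != 0 models -DBROADCAST_BIAS: one bias row reused
                 * for every m; otherwise a full 4x8 bias tile is consumed. */
                c[m][n] += beta * (broadcast ? bias_row[n] : bias_tile[m][n]);
            }
        }
    }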
Supported data types: same as @p src0_ptr - * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr - * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0), - IMAGE_DECLARATION(src1), -#if defined(BETA) - IMAGE_DECLARATION(src2), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint src0_stride_z, - uint src1_stride_z, -#if defined(BETA) - uint src2_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - int x = get_global_id(0) / H0; - int y = get_global_id(1) / V0; - int z = get_global_id(2); - - // Offset - const int offset_row_a = (get_global_id(1) % V0) * 4; - const int offset_row_b = (get_global_id(0) % H0) * 8; - - // src_addr_a = address of matrix A - // src_addr_b = address of matrix B - int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; - int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; -#else // defined(MATRIX_B_DEPTH) - src1_addr_in_bytes += z * src1_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes); - __global half *src_addr_b = (__global half *)(src1_ptr + 
src1_addr_in_bytes); - - // Compute end row address for matrix B - __global half *src_end_addr_b = src_addr_b + IN1_DIM_X; - - src_addr_a += offset_row_a; - src_addr_b += offset_row_b; - - // Reset accumulators - float8 c0 = 0.0f; - float8 c1 = 0.0f; - float8 c2 = 0.0f; - float8 c3 = 0.0f; - - for(; src_addr_b <= (src_end_addr_b - (int)(16 * H0)); src_addr_a += 8 * V0, src_addr_b += 16 * H0) - { - // Load values from matrix A (interleaved) and matrix B (transposed) - float4 a0 = convert_float4(vload4(0, src_addr_a)); - float8 b0 = convert_float8(vload8(0, src_addr_b)); - - c0 += (float8)a0.s0 * b0; - c1 += (float8)a0.s1 * b0; - c2 += (float8)a0.s2 * b0; - c3 += (float8)a0.s3 * b0; - - // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = convert_float4(vload4(0, src_addr_a + 4 * V0)); - b0 = convert_float8(vload8(0, src_addr_b + 8 * H0)); - - c0 += (float8)a0.s0 * b0; - c1 += (float8)a0.s1 * b0; - c2 += (float8)a0.s2 * b0; - c3 += (float8)a0.s3 * b0; - } - - for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * V0, src_addr_b += 8 * H0) - { - // Load values from matrix A (interleaved) and matrix B (transposed) - float4 a0 = convert_float4(vload4(0, src_addr_a)); - float8 b0 = convert_float8(vload8(0, src_addr_b)); - - c0 += (float8)a0.s0 * b0; - c1 += (float8)a0.s1 * b0; - c2 += (float8)a0.s2 * b0; - c3 += (float8)a0.s3 * b0; - } - - // Compute destination address - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Compute dst address - __global uchar *dst_addr = offset(&dst, 0, 0); - - uint4 zout = 0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D - zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D; - zout = min(DEPTH_GEMM3D - 1, zout); - - // Add offset due to the cross plane paddings - zout *= (cross_plane_pad * dst_stride_y); - - // Add offset for batched GEMM. 
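The acc32 variant above trades a little bandwidth for accuracy: operands stay in F16 in memory, but every multiply-accumulate happens in F32 and the result is narrowed only at the store. A scalar sketch (assumes the ARM_COMPUTE_OPENCL_FP16_ENABLED / cl_khr_fp16 path; a_h and b_h are hypothetical half pointers):

    /* F16 in, F32 accumulate, F16 out. */
    float acc = 0.0f;
    for(int k = 0; k < K; ++k)
    {
        acc = fma(convert_float(a_h[k]), convert_float(b_h[k]), acc); /* widen */
    }
    half out = convert_half(acc); /* narrow once, at the store */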
The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; -#else // defined(REINTERPRET_OUTPUT_AS_3D) - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(4, float, c, ALPHA); -#endif // defined(ALPHA) - -#if defined(BETA) - REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0); - -#if defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)); - - LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero); - - float8 bias_f0 = convert_float8(bias0); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, float, bias_f, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(4, c, bias_f0); - -#else // defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id( - 2) * src2_stride_z; - - LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero); - - float8 bias_f0 = convert_float8(bias0); - float8 bias_f1 = convert_float8(bias1); - float8 bias_f2 = convert_float8(bias2); - float8 bias_f3 = convert_float8(bias3); - -#ifndef UNIT_BETA - SCALE_BLOCK(4, float, bias_f, BETA); -#endif // UNIT_BIAS - - // c = c + bias - ADD_BLOCK(4, c, bias_f); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - - half8 c_h0 = convert_half8(c0); - half8 c_h1 = convert_half8(c1); - half8 c_h2 = convert_half8(c2); - half8 c_h3 = convert_half8(c3); - -#if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, VEC_SIZE, c_h, A_VAL, B_VAL); -#endif // defined(ACTIVATION_TYPE) - - // Store 4x8 block - const bool cond_y = ((get_global_id(1) + 1) * 4 >= M); - const bool cond_x = ((get_global_id(0) + 1) * 8 >= N); - STORE_BLOCK_BOUNDARY_AWARE(4, 8, half, c_h, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); -} - -/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1) - * - * @note The number of rows of destination matrix must be passed at compile time using -DM - * @note The number of columns of the destination matrix must be passed at compile time using -DN - * @note The number of rows of the *un-reshaped* matrix B (K) must be passed at compile time using -DK - * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) - * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) - * @note The optional alpha's value need to be passed at compile time using -DALPHA - * @note The multiplication factor for the transposition width (H0) must be passed at compile time using -DH0 (e.g. -DH0=2) - * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DV0 (e.g. -DV0=2) - * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. 
-DMATRIX_B_DEPTH=16) - * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) - * - * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. - * The activation function is performed after the bias addition - * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time: - * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D - * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. - * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped - * - * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16 - * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr - * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
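The REINTERPRET_OUTPUT_AS_3D block that each of these kernels ends with can be read, per destination row, as the following sketch (using the kernels' own names; the real code evaluates it as a uint4 because a 4-row tile may straddle a plane boundary):

    /* Each group of HEIGHT_GEMM3D rows is one z-plane; every plane adds
     * cross_plane_pad padding rows, and batches advance by DEPTH_GEMM3D
     * planes in the destination tensor. */
    uint row    = get_global_id(1) * 4 + m; /* m in [0, 3] */
    uint plane  = min(row / HEIGHT_GEMM3D, (uint)(DEPTH_GEMM3D - 1));
    uint zout_m = plane * cross_plane_pad * dst_stride_y; /* per-row pad offset */
    dst_addr   += z * dst_stride_z * DEPTH_GEMM3D;        /* batch offset */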
Supported data type: same as @p lhs_ptr - * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0), - IMAGE_DECLARATION(src1), -#if defined(BETA) - IMAGE_DECLARATION(src2), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint src0_stride_z, - uint src1_stride_z, -#if defined(BETA) - uint src2_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - int x = get_global_id(0) / H0; - int y = get_global_id(1) / V0; - int z = get_global_id(2); - - // Offset - const int offset_row_a = (get_global_id(1) % V0) * 4; - const int offset_row_b = (get_global_id(0) % H0) * 8; - - // src_addr_a = address of matrix A - // src_addr_b = address of matrix B - int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes; - int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z; -#else // defined(MATRIX_B_DEPTH) - src1_addr_in_bytes += z * src1_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes); - __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes); - - src_addr_a += offset_row_a; - src_addr_b += offset_row_b; - - // Reset accumulators - half8 c0 = 0.0f; - half8 c1 = 0.0f; - half8 c2 = 0.0f; - half8 c3 = 0.0f; - - int i = 0; - for(; i <= (int)(K - 4); i += 4) - { -#if V0 == 1 - // Load values from matrix A (interleaved) and matrix B (transposed) - half8 a0 = vload8(0, src_addr_a); - half8 b0 = vload8(0, src_addr_b); - - src_addr_a += 8 * V0; - src_addr_b += 8 * H0; - - c0 = fma((half8)a0.s0, b0, c0); - c1 = fma((half8)a0.s1, b0, c1); - c2 = fma((half8)a0.s2, b0, c2); - c3 = fma((half8)a0.s3, b0, c3); - - // Load values from matrix B (transposed) - b0 = vload8(0, src_addr_b); - - 
src_addr_b += 8 * H0; - - c0 = fma((half8)a0.s4, b0, c0); - c1 = fma((half8)a0.s5, b0, c1); - c2 = fma((half8)a0.s6, b0, c2); - c3 = fma((half8)a0.s7, b0, c3); - - // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = vload8(0, src_addr_a); - b0 = vload8(0, src_addr_b); - - src_addr_a += 8 * V0; - src_addr_b += 8 * H0; - - c0 = fma((half8)a0.s0, b0, c0); - c1 = fma((half8)a0.s1, b0, c1); - c2 = fma((half8)a0.s2, b0, c2); - c3 = fma((half8)a0.s3, b0, c3); - - // Load values from matrix B (transposed) - b0 = vload8(0, src_addr_b); - - src_addr_b += 8 * H0; - - c0 = fma((half8)a0.s4, b0, c0); - c1 = fma((half8)a0.s5, b0, c1); - c2 = fma((half8)a0.s6, b0, c2); - c3 = fma((half8)a0.s7, b0, c3); -#else // V0 == 1 - // Load values from matrix A (interleaved) and matrix B (transposed) - half4 a0 = vload4(0, src_addr_a); - half8 b0 = vload8(0, src_addr_b); - - src_addr_a += 4 * V0; - src_addr_b += 8 * H0; - - c0 = fma((half8)a0.s0, b0, c0); - c1 = fma((half8)a0.s1, b0, c1); - c2 = fma((half8)a0.s2, b0, c2); - c3 = fma((half8)a0.s3, b0, c3); - - // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = vload4(0, src_addr_a); - b0 = vload8(0, src_addr_b); - - src_addr_a += 4 * V0; - src_addr_b += 8 * H0; - - c0 = fma((half8)a0.s0, b0, c0); - c1 = fma((half8)a0.s1, b0, c1); - c2 = fma((half8)a0.s2, b0, c2); - c3 = fma((half8)a0.s3, b0, c3); - - // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = vload4(0, src_addr_a); - b0 = vload8(0, src_addr_b); - - src_addr_a += 4 * V0; - src_addr_b += 8 * H0; - - c0 = fma((half8)a0.s0, b0, c0); - c1 = fma((half8)a0.s1, b0, c1); - c2 = fma((half8)a0.s2, b0, c2); - c3 = fma((half8)a0.s3, b0, c3); - - // Load values from matrix A (interleaved) and matrix B (transposed) - a0 = vload4(0, src_addr_a); - b0 = vload8(0, src_addr_b); - - src_addr_a += 4 * V0; - src_addr_b += 8 * H0; - - c0 = fma((half8)a0.s0, b0, c0); - c1 = fma((half8)a0.s1, b0, c1); - c2 = fma((half8)a0.s2, b0, c2); - c3 = fma((half8)a0.s3, b0, c3); -#endif // V0 == 1 - } - - for(; i < (int)K; ++i) - { - // Load values from matrix A (interleaved) and matrix B (transposed) - half4 a0 = vload4(0, src_addr_a); - half8 b0 = vload8(0, src_addr_b); - - src_addr_a += 4 * V0; - src_addr_b += 8 * H0; - - c0 = fma((half8)a0.s0, b0, c0); - c1 = fma((half8)a0.s1, b0, c1); - c2 = fma((half8)a0.s2, b0, c2); - c3 = fma((half8)a0.s3, b0, c3); - } - - // Compute destination address - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - // Compute dst address - __global uchar *dst_addr = offset(&dst, 0, 0); - - uint4 zout = 0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D - zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D; - zout = min(DEPTH_GEMM3D - 1, zout); - - // Add offset due to the cross plane paddings - zout *= (cross_plane_pad * dst_stride_y); - - // Add offset for batched GEMM. 
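The V0 == 1 specialization above works because, with no vertical interleaving blocks, the interleaved A stream for this tile is contiguous: a single vload8 fetches two k-steps (2 x 4 values) of A at once, halving the number of A loads. In outline (only c0 shown):

    /* V0 == 1 sketch: a0 holds rows 0..3 of A for step k (s0..s3) and for
     * step k + 1 (s4..s7), so one wide load feeds two fma rounds. */
    half8 a0 = vload8(0, src_addr_a);
    half8 b0 = vload8(0, src_addr_b);
    c0 = fma((half8)a0.s0, b0, c0);       /* step k uses a0.s0..s3   */
    b0 = vload8(0, src_addr_b + 8 * H0);  /* next transposed B block */
    c0 = fma((half8)a0.s4, b0, c0);       /* step k+1 uses a0.s4..s7 */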
The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; -#else // defined(REINTERPRET_OUTPUT_AS_3D) - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(4, half, c, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) - REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0); - -#if defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)); - - LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, half, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(4, c, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id( - 2) * src2_stride_z; - - LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(4, half, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias - ADD_BLOCK(4, c, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - -#if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, VEC_SIZE, c, A_VAL, B_VAL); -#endif // defined(ACTIVATION_TYPE) - - // Store 4x8 block - const bool cond_y = ((get_global_id(1) + 1) * 4 >= M); - const bool cond_x = ((get_global_id(0) + 1) * 8 >= N); - STORE_BLOCK_BOUNDARY_AWARE(4, 8, half, c, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); -} - -#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) - -#endif // defined(M) && defined(N) && defined(K) && defined(H0) && defined(V0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) && defined(IN1_DIM_X) - -#if defined(N) && defined(K) && defined(M0) && defined(N0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) -#if defined(DATA_TYPE) -#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, N0) -/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped. - * - * @note This OpenCL kernel works with floating point data types (F16/F32) - * @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float) - * @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0 - * @note The number of columns of matrix A and the number of columns of the matrix B need to be passed at compile time using -DK and -DN - * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1) - * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1) - * @note The optional alpha's value need to be passed at compile time using -DALPHA - * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. 
-DMATRIX_B_DEPTH=16) - * This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16]) - * - * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively. - * The activation function is performed after the bias addition - * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time: - * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D - * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D - * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. - * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped - * - * @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32 - * @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr - * @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes) - * @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes) - * @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix - * @param[in] src2_ptr (Optional) Pointer to the bias matrix. 
Supported data type: same as @p lhs_ptr - * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D) - * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0), - IMAGE_DECLARATION(src1), -#if defined(BETA) - IMAGE_DECLARATION(src2), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint src0_stride_z, - uint src1_stride_z, -#if defined(BETA) - uint src2_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint src_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - int idx = get_global_id(0) * N0; - - // Compute starting address for matrix A and Matrix B - int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); - - // Update address for the matrix A - src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y; - - // Update address for the matrix B - src_addr.s1 += idx * sizeof(DATA_TYPE); - -#if defined(REINTERPRET_INPUT_AS_3D) - // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zin) is calculated dividing row by HEIGHT_GEMM3D - uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D; - zin = min(DEPTH_GEMM3D - 1, zin); - - // Add offset due to the cross plane paddings - zin *= (src_cross_plane_pad * src0_stride_y); - - // Add offset for batched GEMM. 
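COMPUTE_M0_START_ROW, used above to place each work-item's block of M0 rows, makes the first block the partial one so that every later block covers a full M0 rows; this is also why the final store below checks get_global_id(1) == 0 for cond_y. Assuming the helper's usual definition in gemm_helpers.h, it behaves like:

    /* Sketch: work-item y == 0 covers the PARTIAL_STORE_M0 leftover rows;
     * every later work-item starts on a full M0-aligned block. */
    uint start_row = (uint)max((int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0), 0);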
The batches will be in the fourth dimension and for this reason we - // multiply src0_stride_z by DEPTH_GEMM3D - src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - src_addr.s0 += get_global_id(2) * src0_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; -#else // defined(MATRIX_B_DEPTH) - src_addr.s1 += get_global_id(2) * src1_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - int end_row_vec_a = src_addr.s0 + (K * sizeof(DATA_TYPE)); - - VECTOR_TYPE acc0 = 0.0f; -#if M0 > 1 - VECTOR_TYPE acc1 = 0.0f; -#endif // M0 > 1 -#if M0 > 2 - VECTOR_TYPE acc2 = 0.0f; -#endif // M0 > 2 -#if M0 > 3 - VECTOR_TYPE acc3 = 0.0f; -#endif // M0 > 3 - - for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y)) - { -#if defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - LOAD_BLOCK(M0, 2, DATA_TYPE, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s); -#else // defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - VEC_DATA_TYPE(DATA_TYPE, 2) - a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); -#if M0 > 1 - VEC_DATA_TYPE(DATA_TYPE, 2) - a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); -#endif // M0 > 1 -#if M0 > 2 - VEC_DATA_TYPE(DATA_TYPE, 2) - a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); -#endif // M0 > 2 -#if M0 > 3 - VEC_DATA_TYPE(DATA_TYPE, 2) - a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); -#endif // M0 > 3 -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Load values from matrix B - VECTOR_TYPE b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1)); - VECTOR_TYPE b1 = VLOAD(N0)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y)); - - // Accumulate - acc0 += b0 * (VECTOR_TYPE)a0.s0; - acc0 += b1 * (VECTOR_TYPE)a0.s1; -#if M0 > 1 - acc1 += b0 * (VECTOR_TYPE)a1.s0; - acc1 += b1 * (VECTOR_TYPE)a1.s1; -#endif // M0 > 1 -#if M0 > 2 - acc2 += b0 * (VECTOR_TYPE)a2.s0; - acc2 += b1 * (VECTOR_TYPE)a2.s1; -#endif // M0 > 2 -#if M0 > 3 - acc3 += b0 * (VECTOR_TYPE)a3.s0; - acc3 += b1 * (VECTOR_TYPE)a3.s1; -#endif // M0 > 3 - } - - for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y)) - { -#if defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); -#if M0 > 1 - DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); -#endif // M0 > 1 -#if M0 > 2 - DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); -#endif // M0 > 2 -#if M0 > 3 - DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); -#endif // M0 > 3 -#else // defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); -#if M0 > 1 - DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); -#endif // M0 > 1 -#if M0 > 2 - DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); -#endif // M0 > 2 -#if M0 > 3 - DATA_TYPE a3 = 
*((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); -#endif // M0 > 3 -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Load values from matrix B - VECTOR_TYPE b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1)); - - // Accumulate - acc0 += b0 * (VECTOR_TYPE)a0; -#if M0 > 1 - acc1 += b0 * (VECTOR_TYPE)a1; -#endif // M0 > 1 -#if M0 > 2 - acc2 += b0 * (VECTOR_TYPE)a2; -#endif // M0 > 2 -#if M0 > 3 - acc3 += b0 * (VECTOR_TYPE)a3; -#endif // M0 > 3 - } - - int z = get_global_id(2); - - // Compute dst address - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, - PARTIAL_STORE_M0) - * dst_stride_y); - - uint4 zout = 0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - - // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zout) is calculated dividing row by HEIGHT_GEMM3D - zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D; - zout = min(DEPTH_GEMM3D - 1, zout); - - // Add offset due to the cross plane paddings - zout *= (dst_cross_plane_pad * dst_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; -#else // defined(REINTERPRET_OUTPUT_AS_3D) - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, DATA_TYPE, acc, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) - REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0); - -#if defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)); - - LOAD_BLOCK(1, N0, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias[broadcasted] - ADD_BLOCK_BROADCAST(M0, acc, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, - PARTIAL_STORE_M0) - * src2_stride_y) - + z * src2_stride_z; - - LOAD_BLOCK(M0, N0, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, DATA_TYPE, bias, BETA); -#endif // UNIT_BIAS - - // c = c + bias - ADD_BLOCK(M0, acc, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - -#if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc, A_VAL, B_VAL); -#endif // defined(ACTIVATION_TYPE) - - // Store output block - const bool cond_y = get_global_id(1) == 0; - const bool cond_x = ((get_global_id(0) + 1) * N0 >= N); - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, acc, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); -} -#endif // defined(DATA_TYPE) - -/** This OpenCL kernel 
computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
- *
- * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.
- * @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0.
- * @note This kernel processes a fixed number of elements along x: -DN0=4.
- * @note The number of columns of matrix A and the number of columns of the matrix B need to be passed at compile time using -DK and -DN
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note The optional alpha value needs to be passed at compile time using -DALPHA
- * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
- *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
- *       The activation function is performed after the bias addition
- * @note In case the input or output has to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
- *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
- *
- * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
- * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
- * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
- * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
- * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
- * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
- * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
- * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
- * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
- * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix.
Supported data type: same as @p lhs_ptr - * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D) - * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0), - IMAGE_DECLARATION(src1), -#if defined(BETA) - IMAGE_DECLARATION(src2), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint src0_stride_z, - uint src1_stride_z, -#if defined(BETA) - uint src2_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint src_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - int idx = get_global_id(0) * N0; - - // Compute starting address for matrix A and matrix B - int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); - - // Update address for matrix A - src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y; - - // Update address for matrix B - src_addr.s1 += idx * sizeof(float); - -#if defined(REINTERPRET_INPUT_AS_3D) - // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zin) is calculated dividing row by HEIGHT_GEMM3D - uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D; - zin = min(DEPTH_GEMM3D - 1, zin); - - // Add offset due to the cross plane paddings - zin *= (src_cross_plane_pad * src0_stride_y); - - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply src0_stride_z by DEPTH_GEMM3D - src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - src_addr.s0 += get_global_id(2) * src0_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; -#else // defined(MATRIX_B_DEPTH) - src_addr.s1 += get_global_id(2) * src1_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - // Initialize accumulators - float4 acc0 = 0.0f; - -#if M0 > 1 - float4 acc1 = 0.0f; -#endif // M0 > 1 - -#if M0 > 2 - float4 acc2 = 0.0f; -#endif // M0 > 2 - -#if M0 > 3 - float4 acc3 = 0.0f; -#endif // M0 > 3 - - // A and B src indices get incremented at the same time. - int i = 0; - for(; i <= ((int)K - 4); i += 4) - { -#if defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A and matrix B - LOAD_BLOCK(M0, 4, float, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s); -#else // defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A and matrix B - float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); -#if M0 > 1 - float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); -#endif // M0 > 1 -#if M0 > 2 - float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); -#endif // M0 > 2 -#if M0 > 3 - float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); -#endif // M0 > 3 -#endif // defined(REINTERPRET_INPUT_AS_3D) - - float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - - // Multiply and accumulate - acc0.s0 = fma(a0.s0, b0.s0, acc0.s0); - acc0.s1 = fma(a0.s0, b0.s1, acc0.s1); - acc0.s2 = fma(a0.s0, b0.s2, acc0.s2); - acc0.s3 = fma(a0.s0, b0.s3, acc0.s3); - -#if M0 > 1 - - acc1.s0 = fma(a1.s0, b0.s0, acc1.s0); - acc1.s1 = fma(a1.s0, b0.s1, acc1.s1); - acc1.s2 = fma(a1.s0, b0.s2, acc1.s2); - acc1.s3 = fma(a1.s0, b0.s3, acc1.s3); - -#endif // M0 > 1 -#if M0 > 2 - - acc2.s0 = fma(a2.s0, b0.s0, acc2.s0); - acc2.s1 = fma(a2.s0, b0.s1, acc2.s1); - acc2.s2 = fma(a2.s0, b0.s2, acc2.s2); - acc2.s3 = fma(a2.s0, b0.s3, acc2.s3); - -#endif // M0 > 2 -#if M0 > 3 - - acc3.s0 = fma(a3.s0, b0.s0, acc3.s0); - acc3.s1 = fma(a3.s0, b0.s1, acc3.s1); - acc3.s2 = fma(a3.s0, b0.s2, acc3.s2); - acc3.s3 = fma(a3.s0, b0.s3, acc3.s3); -#endif // M0 > 3 - - // Load values from matrix A and matrix B - b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - - // Multiply and accumulate - acc0.s0 = fma(a0.s1, b0.s0, acc0.s0); - acc0.s1 = fma(a0.s1, b0.s1, acc0.s1); - acc0.s2 = fma(a0.s1, b0.s2, acc0.s2); - acc0.s3 = fma(a0.s1, b0.s3, acc0.s3); - -#if M0 > 1 - - acc1.s0 = fma(a1.s1, b0.s0, acc1.s0); - acc1.s1 = fma(a1.s1, b0.s1, acc1.s1); - acc1.s2 = fma(a1.s1, b0.s2, acc1.s2); - acc1.s3 = fma(a1.s1, b0.s3, acc1.s3); - -#endif // M0 > 1 -#if M0 > 2 - - acc2.s0 = fma(a2.s1, b0.s0, acc2.s0); - acc2.s1 = fma(a2.s1, b0.s1, acc2.s1); - acc2.s2 = fma(a2.s1, b0.s2, acc2.s2); - acc2.s3 = fma(a2.s1, b0.s3, acc2.s3); - -#endif // M0 > 2 -#if M0 > 3 - - acc3.s0 = fma(a3.s1, b0.s0, acc3.s0); - acc3.s1 = fma(a3.s1, b0.s1, acc3.s1); - acc3.s2 = fma(a3.s1, b0.s2, acc3.s2); - acc3.s3 = fma(a3.s1, b0.s3, acc3.s3); -#endif // M0 > 3 - - // Load values from matrix A and 
matrix B - b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - - // Multiply and accumulate - acc0.s0 = fma(a0.s2, b0.s0, acc0.s0); - acc0.s1 = fma(a0.s2, b0.s1, acc0.s1); - acc0.s2 = fma(a0.s2, b0.s2, acc0.s2); - acc0.s3 = fma(a0.s2, b0.s3, acc0.s3); - -#if M0 > 1 - - acc1.s0 = fma(a1.s2, b0.s0, acc1.s0); - acc1.s1 = fma(a1.s2, b0.s1, acc1.s1); - acc1.s2 = fma(a1.s2, b0.s2, acc1.s2); - acc1.s3 = fma(a1.s2, b0.s3, acc1.s3); - -#endif // M0 > 1 -#if M0 > 2 - - acc2.s0 = fma(a2.s2, b0.s0, acc2.s0); - acc2.s1 = fma(a2.s2, b0.s1, acc2.s1); - acc2.s2 = fma(a2.s2, b0.s2, acc2.s2); - acc2.s3 = fma(a2.s2, b0.s3, acc2.s3); - -#endif // M0 > 2 -#if M0 > 3 - - acc3.s0 = fma(a3.s2, b0.s0, acc3.s0); - acc3.s1 = fma(a3.s2, b0.s1, acc3.s1); - acc3.s2 = fma(a3.s2, b0.s2, acc3.s2); - acc3.s3 = fma(a3.s2, b0.s3, acc3.s3); -#endif // M0 > 3 - - // Load values from matrix A and matrix B - b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - - // Multiply and accumulate - acc0.s0 = fma(a0.s3, b0.s0, acc0.s0); - acc0.s1 = fma(a0.s3, b0.s1, acc0.s1); - acc0.s2 = fma(a0.s3, b0.s2, acc0.s2); - acc0.s3 = fma(a0.s3, b0.s3, acc0.s3); - -#if M0 > 1 - - acc1.s0 = fma(a1.s3, b0.s0, acc1.s0); - acc1.s1 = fma(a1.s3, b0.s1, acc1.s1); - acc1.s2 = fma(a1.s3, b0.s2, acc1.s2); - acc1.s3 = fma(a1.s3, b0.s3, acc1.s3); - -#endif // M0 > 1 -#if M0 > 2 - - acc2.s0 = fma(a2.s3, b0.s0, acc2.s0); - acc2.s1 = fma(a2.s3, b0.s1, acc2.s1); - acc2.s2 = fma(a2.s3, b0.s2, acc2.s2); - acc2.s3 = fma(a2.s3, b0.s3, acc2.s3); - -#endif // M0 > 2 -#if M0 > 3 - - acc3.s0 = fma(a3.s3, b0.s0, acc3.s0); - acc3.s1 = fma(a3.s3, b0.s1, acc3.s1); - acc3.s2 = fma(a3.s3, b0.s2, acc3.s2); - acc3.s3 = fma(a3.s3, b0.s3, acc3.s3); -#endif // M0 > 3 - - src_addr.s0 += 4 * sizeof(float); - } - - for(; i < (int)K; ++i) - { -#if defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); -#if M0 > 1 - float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); -#endif // M0 > 1 -#if M0 > 2 - float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); -#endif // M0 > 2 -#if M0 > 3 - float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); -#endif // M0 > 3 -#else // defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); -#if M0 > 1 - float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); -#endif // M0 > 1 -#if M0 > 2 - float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); -#endif // M0 > 2 -#if M0 > 3 - float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); -#endif // M0 > 3 -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Load values from matrix B - float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - - // Multiply and accumulate - acc0.s0 = fma(a0, b0.s0, acc0.s0); - acc0.s1 = fma(a0, b0.s1, acc0.s1); - acc0.s2 = fma(a0, b0.s2, acc0.s2); - acc0.s3 = fma(a0, b0.s3, acc0.s3); -#if M0 > 1 - acc1.s0 = fma(a1, b0.s0, acc1.s0); - acc1.s1 = fma(a1, b0.s1, acc1.s1); - acc1.s2 = fma(a1, b0.s2, acc1.s2); - acc1.s3 = fma(a1, b0.s3, acc1.s3); -#endif // M0 > 1 -#if M0 > 2 - acc2.s0 = fma(a2, b0.s0, acc2.s0); - acc2.s1 = fma(a2, b0.s1, acc2.s1); - acc2.s2 = fma(a2, b0.s2, acc2.s2); - acc2.s3 = fma(a2, b0.s3, 
acc2.s3); -#endif // M0 > 2 -#if M0 > 3 - acc3.s0 = fma(a3, b0.s0, acc3.s0); - acc3.s1 = fma(a3, b0.s1, acc3.s1); - acc3.s2 = fma(a3, b0.s2, acc3.s2); - acc3.s3 = fma(a3, b0.s3, acc3.s3); -#endif // M0 > 3 - - src_addr.s0 += sizeof(float); - } - - int z = get_global_id(2); - - // Compute dst address - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, - PARTIAL_STORE_M0) - * dst_stride_y); - - uint4 zout = 0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zout) is calculated dividing row by HEIGHT_GEMM3D - zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D; - zout = min(DEPTH_GEMM3D - 1, zout); - - // Add offset due to the cross plane paddings - zout *= (dst_cross_plane_pad * dst_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; -#else // defined(REINTERPRET_OUTPUT_AS_3D) - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, float, acc, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) - REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0); - -#if defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)); - - LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, float, bias, BETA); -#endif // UNIT_BIAS - - // acc = acc + bias[broadcasted] - ADD_BLOCK_BROADCAST(M0, acc, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, - PARTIAL_STORE_M0) - * src2_stride_y) - + z * src2_stride_z; - - LOAD_BLOCK(M0, 4, float, bias, src2_addr, 0, src2_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, float, bias, BETA); -#endif // UNIT_BIAS - - // acc = acc + bias - ADD_BLOCK(M0, acc, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - -#if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, float, VEC_SIZE, acc, A_VAL, B_VAL); -#endif // defined(ACTIVATION_TYPE) - - // Store the output block - const bool cond_y = get_global_id(1) == 0; - const bool cond_x = ((get_global_id(0) + 1) * 4 >= N); - STORE_BLOCK_BOUNDARY_AWARE(M0, 4, float, acc, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); -} - -/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped - * - * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units. 
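For reference, each iteration of the unrolled K loops in the kernels above performs one rank-1 update of the M0 x N0 accumulator tile through fused multiply-adds. The following is a minimal plain-C sketch of that update pattern, assuming row-major tiles; gemm_tile_fma and its parameter names are hypothetical and are not part of the library:

#include <math.h>
#include <stdio.h>

/* One M0 x N0 accumulator tile updated K times, mirroring the fma
 * accumulation order of the kernels above: each loaded A element is
 * reused across all N0 lanes of the current B row. */
void gemm_tile_fma(const float *a, int lda,  /* A tile, m0 x k  */
                   const float *b, int ldb,  /* B tile, k x n0  */
                   float *acc,               /* C tile, m0 x n0 */
                   int m0, int n0, int k)
{
    for (int kk = 0; kk < k; ++kk)           /* one rank-1 update per step */
    {
        for (int m = 0; m < m0; ++m)
        {
            const float a_mk = a[m * lda + kk];
            for (int n = 0; n < n0; ++n)
            {
                acc[m * n0 + n] = fmaf(a_mk, b[kk * ldb + n], acc[m * n0 + n]);
            }
        }
    }
}

int main(void)
{
    const float a[2 * 3] = { 1, 2, 3, 4, 5, 6 }; /* m0=2, k=3 */
    const float b[3 * 2] = { 1, 0, 0, 1, 1, 1 }; /* k=3, n0=2 */
    float       acc[2 * 2] = { 0 };
    gemm_tile_fma(a, 3, b, 2, acc, 2, 2, 3);
    printf("%g %g %g %g\n", acc[0], acc[1], acc[2], acc[3]); /* 4 5 10 11 */
    return 0;
}

The unroll factors in the kernels (4 for F32, 8 for the two-column Bifrost variant) simply peel consecutive kk iterations of this loop so that one vector load of A feeds several rank-1 updates.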
- * This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less than or equal to 1000.
- * @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0.
- * @note This kernel processes a fixed number of elements along x: -DN0=2.
- * @note The number of columns of matrix A and the number of columns of the matrix B need to be passed at compile time using -DK and -DN
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note The optional alpha value needs to be passed at compile time using -DALPHA
- * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
- *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
- *       The activation function is performed after the bias addition
- * @note In case the input or output has to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
- *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
- *
- * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F32
- * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
- * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
- * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
- * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
- * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
- * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
- * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
- * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
- * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix.
Supported data type: same as @p lhs_ptr - * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D) - * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0), - IMAGE_DECLARATION(src1), -#if defined(BETA) - IMAGE_DECLARATION(src2), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint src0_stride_z, - uint src1_stride_z, -#if defined(BETA) - uint src2_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint src_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - // Requires 2 N0, C vect2, A vect4, B (2 vload2) // to fix for M0 > 1 - int idx = get_global_id(0) * N0; - - // Compute starting address for matrix A and Matrix B - int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); - - // Update address for the matrix A - src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y; - - // Update address for the matrix B - src_addr.s1 += idx * sizeof(float); - -#if defined(REINTERPRET_INPUT_AS_3D) - // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zin) is calculated dividing row by HEIGHT_GEMM3D - uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D; - zin = min(DEPTH_GEMM3D - 1, zin); - - // Add offset due to the cross plane paddings - zin *= (src_cross_plane_pad * 
src0_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply src0_stride_z by DEPTH_GEMM3D - src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - src_addr.s0 += get_global_id(2) * src0_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; -#else // defined(MATRIX_B_DEPTH) - src_addr.s1 += get_global_id(2) * src1_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - // Initialize accumulators - float2 acc0 = 0.0f; -#if M0 > 1 - float2 acc1 = 0.0f; -#endif // M0 > 1 -#if M0 > 2 - float2 acc2 = 0.0f; -#endif // M0 > 2 -#if M0 > 3 - float2 acc3 = 0.0f; -#endif // M0 > 3 - - // A and B src indices get incremented at the same time. - int i = 0; - for(; i <= ((int)K - 8); i += 8) - { -#if defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + zin.s0)); -#else // defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0)); -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Load values from matrix B - float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - - // Multiply and accumulate - acc0.s0 = fma(a0.s0, b0.s0, acc0.s0); - acc0.s0 = fma(a0.s1, b1.s0, acc0.s0); - acc0.s0 = fma(a0.s2, b2.s0, acc0.s0); - acc0.s0 = fma(a0.s3, b3.s0, acc0.s0); - acc0.s0 = fma(a0.s4, b4.s0, acc0.s0); - acc0.s0 = fma(a0.s5, b5.s0, acc0.s0); - acc0.s0 = fma(a0.s6, b6.s0, acc0.s0); - acc0.s0 = fma(a0.s7, b7.s0, acc0.s0); - - acc0.s1 = fma(a0.s0, b0.s1, acc0.s1); - acc0.s1 = fma(a0.s1, b1.s1, acc0.s1); - acc0.s1 = fma(a0.s2, b2.s1, acc0.s1); - acc0.s1 = fma(a0.s3, b3.s1, acc0.s1); - acc0.s1 = fma(a0.s4, b4.s1, acc0.s1); - acc0.s1 = fma(a0.s5, b5.s1, acc0.s1); - acc0.s1 = fma(a0.s6, b6.s1, acc0.s1); - acc0.s1 = fma(a0.s7, b7.s1, acc0.s1); - -#if M0 > 1 -#if defined(REINTERPRET_INPUT_AS_3D) - a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); -#else // defined(REINTERPRET_INPUT_AS_3D) - a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); -#endif // defined(REINTERPRET_INPUT_AS_3D) - acc1.s0 = fma(a0.s0, b0.s0, acc1.s0); - acc1.s0 = fma(a0.s1, b1.s0, acc1.s0); - acc1.s0 = fma(a0.s2, b2.s0, acc1.s0); - acc1.s0 = fma(a0.s3, b3.s0, acc1.s0); - acc1.s0 = fma(a0.s4, b4.s0, acc1.s0); - acc1.s0 = fma(a0.s5, b5.s0, acc1.s0); - acc1.s0 = fma(a0.s6, b6.s0, acc1.s0); - acc1.s0 = fma(a0.s7, b7.s0, acc1.s0); - - acc1.s1 = fma(a0.s0, b0.s1, 
acc1.s1); - acc1.s1 = fma(a0.s1, b1.s1, acc1.s1); - acc1.s1 = fma(a0.s2, b2.s1, acc1.s1); - acc1.s1 = fma(a0.s3, b3.s1, acc1.s1); - acc1.s1 = fma(a0.s4, b4.s1, acc1.s1); - acc1.s1 = fma(a0.s5, b5.s1, acc1.s1); - acc1.s1 = fma(a0.s6, b6.s1, acc1.s1); - acc1.s1 = fma(a0.s7, b7.s1, acc1.s1); -#endif // M0 > 1 -#if M0 > 2 -#if defined(REINTERPRET_INPUT_AS_3D) - a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); -#else // defined(REINTERPRET_INPUT_AS_3D) - a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); -#endif // defined(REINTERPRET_INPUT_AS_3D) - acc2.s0 = fma(a0.s0, b0.s0, acc2.s0); - acc2.s0 = fma(a0.s1, b1.s0, acc2.s0); - acc2.s0 = fma(a0.s2, b2.s0, acc2.s0); - acc2.s0 = fma(a0.s3, b3.s0, acc2.s0); - acc2.s0 = fma(a0.s4, b4.s0, acc2.s0); - acc2.s0 = fma(a0.s5, b5.s0, acc2.s0); - acc2.s0 = fma(a0.s6, b6.s0, acc2.s0); - acc2.s0 = fma(a0.s7, b7.s0, acc2.s0); - - acc2.s1 = fma(a0.s0, b0.s1, acc2.s1); - acc2.s1 = fma(a0.s1, b1.s1, acc2.s1); - acc2.s1 = fma(a0.s2, b2.s1, acc2.s1); - acc2.s1 = fma(a0.s3, b3.s1, acc2.s1); - acc2.s1 = fma(a0.s4, b4.s1, acc2.s1); - acc2.s1 = fma(a0.s5, b5.s1, acc2.s1); - acc2.s1 = fma(a0.s6, b6.s1, acc2.s1); - acc2.s1 = fma(a0.s7, b7.s1, acc2.s1); -#endif // M0 > 2 -#if M0 > 3 -#if defined(REINTERPRET_INPUT_AS_3D) - a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); -#else // defined(REINTERPRET_INPUT_AS_3D) - a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); -#endif // defined(REINTERPRET_INPUT_AS_3D) - acc3.s0 = fma(a0.s0, b0.s0, acc3.s0); - acc3.s0 = fma(a0.s1, b1.s0, acc3.s0); - acc3.s0 = fma(a0.s2, b2.s0, acc3.s0); - acc3.s0 = fma(a0.s3, b3.s0, acc3.s0); - acc3.s0 = fma(a0.s4, b4.s0, acc3.s0); - acc3.s0 = fma(a0.s5, b5.s0, acc3.s0); - acc3.s0 = fma(a0.s6, b6.s0, acc3.s0); - acc3.s0 = fma(a0.s7, b7.s0, acc3.s0); - - acc3.s1 = fma(a0.s0, b0.s1, acc3.s1); - acc3.s1 = fma(a0.s1, b1.s1, acc3.s1); - acc3.s1 = fma(a0.s2, b2.s1, acc3.s1); - acc3.s1 = fma(a0.s3, b3.s1, acc3.s1); - acc3.s1 = fma(a0.s4, b4.s1, acc3.s1); - acc3.s1 = fma(a0.s5, b5.s1, acc3.s1); - acc3.s1 = fma(a0.s6, b6.s1, acc3.s1); - acc3.s1 = fma(a0.s7, b7.s1, acc3.s1); -#endif // M0 > 3 - - src_addr.s0 += sizeof(float) * 8; - } - // float size increment - for(; i < (int)K; ++i) - { -#if defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); -#if M0 > 1 - float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); -#endif // M0 > 1 -#if M0 > 2 - float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); -#endif // M0 > 2 -#if M0 > 3 - float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); -#endif // M0 > 3 -#else // defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); -#if M0 > 1 - float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); -#endif // M0 > 1 -#if M0 > 2 - float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); -#endif // M0 > 2 -#if M0 > 3 - float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); -#endif // M0 > 3 -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Load values from matrix B - float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - - // Multiply and accumulate - 
acc0.s0 = fma(a0, b0.s0, acc0.s0); - acc0.s1 = fma(a0, b0.s1, acc0.s1); -#if M0 > 1 - acc1.s0 = fma(a1, b0.s0, acc1.s0); - acc1.s1 = fma(a1, b0.s1, acc1.s1); -#endif // M0 > 1 -#if M0 > 2 - acc2.s0 = fma(a2, b0.s0, acc2.s0); - acc2.s1 = fma(a2, b0.s1, acc2.s1); -#endif // M0 > 2 -#if M0 > 3 - acc3.s0 = fma(a3, b0.s0, acc3.s0); - acc3.s1 = fma(a3, b0.s1, acc3.s1); -#endif // M0 > 3 - - src_addr.s0 += sizeof(float); - } - - int z = get_global_id(2); - - // Compute dst address - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, - PARTIAL_STORE_M0) - * dst_stride_y); - - uint4 zout = 0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - - // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zout) is calculated dividing row by HEIGHT_GEMM3D - zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D; - zout = min(DEPTH_GEMM3D - 1, zout); - - // Add offset due to the cross plane paddings - zout *= (dst_cross_plane_pad * dst_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; -#else // defined(REINTERPRET_OUTPUT_AS_3D) - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Multiply by the weight of matrix-matrix product and store the result -#if defined(ALPHA) - SCALE_BLOCK(M0, float, acc, ALPHA); -#endif // defined(ALPHA) - - // Add beta*bias -#if defined(BETA) - REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0); - -#if defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)); - - LOAD_BLOCK(1, 2, float, bias, src2_addr, 0, src2_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(1, float, bias, BETA); -#endif // UNIT_BIAS - - // acc = acc + bias[broadcasted] - ADD_BLOCK_BROADCAST(M0, acc, bias0); - -#else // defined(BROADCAST_BIAS) - __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, - PARTIAL_STORE_M0) - * src2_stride_y) - + z * src2_stride_z; - - LOAD_BLOCK(M0, 2, float, bias, src2_addr, 0, src2_stride_y, zero); - -#ifndef UNIT_BETA - SCALE_BLOCK(M0, float, bias, BETA); -#endif // UNIT_BIAS - - // acc = acc + bias - ADD_BLOCK(M0, acc, bias); - -#endif // defined(BROADCAST_BIAS) -#endif // defined(BETA) - -#if defined(ACTIVATION_TYPE) - ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, float, VEC_SIZE, acc, A_VAL, B_VAL); -#endif // defined(ACTIVATION_TYPE) - - // Store the output block - const bool cond_y = get_global_id(1) == 0; - const bool cond_x = ((get_global_id(0) + 1) * 2 >= N); - STORE_BLOCK_BOUNDARY_AWARE(M0, 2, float, acc, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); -} - -#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) -/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B 
(src1) in case both matrices have not been reshaped
- *
- * @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulates the result in a 32-bit floating point variable.
- * @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0.
- * @note This kernel processes a fixed number of elements along x: -DN0=8.
- * @note The number of columns of matrix A and the number of columns of the matrix B need to be passed at compile time using -DK and -DN
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note The optional alpha value needs to be passed at compile time using -DALPHA
- * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
- *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
- *       The activation function is performed after the bias addition
- * @note In case the input or output has to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
- *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
- *
- * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
- * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
- * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
- * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
- * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
- * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
- * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
- * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
- * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
- * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix.
Supported data type: same as @p lhs_ptr - * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D) - * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0), - IMAGE_DECLARATION(src1), -#if defined(BETA) - IMAGE_DECLARATION(src2), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint src0_stride_z, - uint src1_stride_z, -#if defined(BETA) - uint src2_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint src_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - int idx = get_global_id(0) * N0; - - // Compute starting address for matrix A and Matrix B - int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); - - // Update address for the matrix A - src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y; - - // Update address for the matrix B - src_addr.s1 += idx * sizeof(half); - -#if defined(REINTERPRET_INPUT_AS_3D) - // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zin) is calculated dividing row by HEIGHT_GEMM3D - uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D; - zin = min(DEPTH_GEMM3D - 1, zin); - - // Add offset due to the cross plane paddings - zin *= (src_cross_plane_pad * src0_stride_y); - - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply src0_stride_z by DEPTH_GEMM3D - src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - src_addr.s0 += get_global_id(2) * src0_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; -#else // defined(MATRIX_B_DEPTH) - src_addr.s1 += get_global_id(2) * src1_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - float8 acc0 = 0.0h; -#if M0 > 1 - float8 acc1 = 0.0h; -#endif // M0 > 1 -#if M0 > 2 - float8 acc2 = 0.0h; -#endif // M0 > 2 -#if M0 > 3 - float8 acc3 = 0.0h; -#endif // M0 > 3 - - int i = 0; - for(; i <= ((int)K - 4); i += 4) - { -#if defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - LOAD_BLOCK(M0, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s); -#else // defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); -#if M0 > 1 - half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); -#endif // M0 > 1 -#if M0 > 2 - half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); -#endif // M0 > 2 -#if M0 > 3 - half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); -#endif // M0 > 3 -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Load values from matrix B - float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); - src_addr.s1 += src1_stride_y; - - // Accumulate - acc0 = fma(b0, (float8)a0.s0, acc0); -#if M0 > 1 - acc1 = fma(b0, (float8)a1.s0, acc1); -#endif // M0 > 1 -#if M0 > 2 - acc2 = fma(b0, (float8)a2.s0, acc2); -#endif // M0 > 2 -#if M0 > 3 - acc3 = fma(b0, (float8)a3.s0, acc3); -#endif // M0 > 3 - - b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); - src_addr.s1 += src1_stride_y; - acc0 = fma(b0, (float8)a0.s1, acc0); -#if M0 > 1 - acc1 = fma(b0, (float8)a1.s1, acc1); -#endif // M0 > 1 -#if M0 > 2 - acc2 = fma(b0, (float8)a2.s1, acc2); -#endif // M0 > 2 -#if M0 > 3 - acc3 = fma(b0, (float8)a3.s1, acc3); -#endif // M0 > 3 - - b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); - src_addr.s1 += src1_stride_y; - acc0 = fma(b0, (float8)a0.s2, acc0); -#if M0 > 1 - acc1 = fma(b0, (float8)a1.s2, acc1); -#endif // M0 > 1 -#if M0 > 2 - acc2 = fma(b0, (float8)a2.s2, acc2); -#endif // M0 > 2 -#if M0 > 3 - acc3 = fma(b0, (float8)a3.s2, acc3); -#endif // M0 > 3 - - b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); - src_addr.s1 += src1_stride_y; - acc0 = fma(b0, (float8)a0.s3, acc0); -#if M0 > 1 - acc1 = fma(b0, (float8)a1.s3, acc1); -#endif // M0 > 1 -#if M0 > 2 - acc2 = fma(b0, (float8)a2.s3, acc2); -#endif // M0 > 2 -#if M0 > 3 - acc3 = fma(b0, (float8)a3.s3, acc3); -#endif // M0 > 3 - - src_addr.s0 += 4 * sizeof(half); - } - - for(; i < (int)K; ++i) - { -#if defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); -#if M0 > 1 - half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); -#endif // M0 > 1 -#if M0 > 2 - half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); -#endif // M0 > 2 -#if M0 > 3 - 
half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); -#endif // M0 > 3 -#else // defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); -#if M0 > 1 - half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); -#endif // M0 > 1 -#if M0 > 2 - half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); -#endif // M0 > 2 -#if M0 > 3 - half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); -#endif // M0 > 3 -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Load values from matrix B - float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1))); - - src_addr += (int2)(sizeof(half), src1_stride_y); - - // Accumulate - acc0 = fma(b0, (float8)a0, acc0); // b0 * (half8)a0; -#if M0 > 1 - acc1 = fma(b0, (float8)a1, acc1); // b0 * (half8)a1; -#endif // M0 > 1 -#if M0 > 2 - acc2 = fma(b0, (float8)a2, acc2); // b0 * (half8)a2; -#endif // M0 > 2 -#if M0 > 3 - acc3 = fma(b0, (float8)a3, acc3); // b0 * (half8)a3; -#endif // M0 > 3 - } - - int z = get_global_id(2); - - // Compute dst address - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * dst_stride_y); - - uint4 zout = 0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - - // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zout) is calculated dividing row by HEIGHT_GEMM3D - zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D; - zout = min(DEPTH_GEMM3D - 1, zout); - - // Add offset due to the cross plane paddings - zout *= (dst_cross_plane_pad * dst_stride_y); - - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we
-    // multiply dst_stride_z by DEPTH_GEMM3D
-    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-#else  // defined(REINTERPRET_OUTPUT_AS_3D)
-    // Add offset for batched GEMM
-    dst_addr += z * dst_stride_z;
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Multiply by the weight of matrix-matrix product and store the result
-#if defined(ALPHA)
-    SCALE_BLOCK(M0, float, acc, ALPHA);
-#endif // defined(ALPHA)
-
-#if defined(BETA)
-    REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
-
-#if defined(BROADCAST_BIAS)
-    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
-
-    LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
-
-    float8 bias_f0 = convert_float8(bias0);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(1, float, bias_f, BETA);
-#endif // UNIT_BETA
-
-    // acc = acc + bias[broadcasted]
-    ADD_BLOCK_BROADCAST(M0, acc, bias_f0);
-
-#else // defined(BROADCAST_BIAS)
-    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0,
-                                PARTIAL_STORE_M0)
-                                * src2_stride_y)
-                                + z * src2_stride_z;
-
-    LOAD_BLOCK(M0, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
-
-    float8 bias_f0 = convert_float8(bias0);
-#if M0 > 1
-    float8 bias_f1 = convert_float8(bias1);
-#endif // M0 > 1
-#if M0 > 2
-    float8 bias_f2 = convert_float8(bias2);
-#endif // M0 > 2
-#if M0 > 3
-    float8 bias_f3 = convert_float8(bias3);
-#endif // M0 > 3
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(M0, float, bias_f, BETA);
-#endif // UNIT_BETA
-
-    // acc = acc + bias
-    ADD_BLOCK(M0, acc, bias_f);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-    half8 acc_h0 = convert_half8(acc0);
-#if M0 > 1
-    half8 acc_h1 = convert_half8(acc1);
-#endif // M0 > 1
-#if M0 > 2
-    half8 acc_h2 = convert_half8(acc2);
-#endif // M0 > 2
-#if M0 > 3
-    half8 acc_h3 = convert_half8(acc3);
-#endif // M0 > 3
-
-#if defined(ACTIVATION_TYPE)
-    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, half, VEC_SIZE, acc_h, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
-    // Store the output block
-    const bool cond_y = get_global_id(1) == 0;
-    const bool cond_x = ((get_global_id(0) + 1) * 8 >= N);
-    STORE_BLOCK_BOUNDARY_AWARE(M0, 8, half, acc_h, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-}
-
-/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
- *
- * @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units.
- * @note The number of elements processed along the x and y directions must be passed at compile time using -DN0 and -DM0.
- * @note This kernel processes a fixed number of elements along x: -DN0=8.
- * @note The number of columns of matrix A and the number of columns of the matrix B need to be passed at compile time using -DK and -DN
- * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
- * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
- * @note The optional alpha value needs to be passed at compile time using -DALPHA
- * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)
- *       This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
- *
- * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
- *       The activation function is performed after the bias addition
- * @note In case the input or output has to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
- *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
- *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
- *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
- *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
- *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
- *
- * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data types: F16
- * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
- * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
- * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
- * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
- * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data types: same as @p src0_ptr
- * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
- * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
- * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
- * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
- * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[in]  src2_ptr                           (Optional) Pointer to the bias matrix.
Supported data type: same as @p lhs_ptr - * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes) - * @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes) - * @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes) - * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D) - * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0), - IMAGE_DECLARATION(src1), -#if defined(BETA) - IMAGE_DECLARATION(src2), -#endif // defined(BETA) - IMAGE_DECLARATION(dst), - uint src0_stride_z, - uint src1_stride_z, -#if defined(BETA) - uint src2_stride_z, -#endif //defined(BETA) - uint dst_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint src_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - int idx = get_global_id(0) * N0; - - // Compute starting address for matrix A and Matrix B - int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes)); - - // Update address for the matrix A - src_addr.s0 += COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src0_stride_y; - - // Update address for the matrix B - src_addr.s1 += idx * sizeof(half); - -#if defined(REINTERPRET_INPUT_AS_3D) - // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zin) is calculated dividing row by HEIGHT_GEMM3D - uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D; - zin = min(DEPTH_GEMM3D - 1, zin); - - // Add offset due to the cross plane paddings - zin *= (src_cross_plane_pad * src0_stride_y); - - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply src0_stride_z by DEPTH_GEMM3D - src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - src_addr.s0 += get_global_id(2) * src0_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z; -#else // defined(MATRIX_B_DEPTH) - src_addr.s1 += get_global_id(2) * src1_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - half8 acc0 = 0.0h; -#if M0 > 1 - half8 acc1 = 0.0h; -#endif // M0 > 1 -#if M0 > 2 - half8 acc2 = 0.0h; -#endif // M0 > 2 -#if M0 > 3 - half8 acc3 = 0.0h; -#endif // M0 > 3 - - int i = 0; - for(; i <= ((int)K - 4); i += 4) - { -#if defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - LOAD_BLOCK(M0, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s); -#else // defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); -#if M0 > 1 - half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); -#endif // M0 > 1 -#if M0 > 2 - half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); -#endif // M0 > 2 -#if M0 > 3 - half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); -#endif // M0 > 3 -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Load values from matrix B - half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - - // Accumulate - acc0 = fma(b0, (half8)a0.s0, acc0); -#if M0 > 1 - acc1 = fma(b0, (half8)a1.s0, acc1); -#endif // M0 > 1 -#if M0 > 2 - acc2 = fma(b0, (half8)a2.s0, acc2); -#endif // M0 > 2 -#if M0 > 3 - acc3 = fma(b0, (half8)a3.s0, acc3); -#endif // M0 > 3 - - b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - acc0 = fma(b0, (half8)a0.s1, acc0); -#if M0 > 1 - acc1 = fma(b0, (half8)a1.s1, acc1); -#endif // M0 > 1 -#if M0 > 2 - acc2 = fma(b0, (half8)a2.s1, acc2); -#endif // M0 > 2 -#if M0 > 3 - acc3 = fma(b0, (half8)a3.s1, acc3); -#endif // M0 > 3 - - b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - acc0 = fma(b0, (half8)a0.s2, acc0); -#if M0 > 1 - acc1 = fma(b0, (half8)a1.s2, acc1); -#endif // M0 > 1 -#if M0 > 2 - acc2 = fma(b0, (half8)a2.s2, acc2); -#endif // M0 > 2 -#if M0 > 3 - acc3 = fma(b0, (half8)a3.s2, acc3); -#endif // M0 > 3 - - b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); - src_addr.s1 += src1_stride_y; - acc0 = fma(b0, (half8)a0.s3, acc0); -#if M0 > 1 - acc1 = fma(b0, (half8)a1.s3, acc1); -#endif // M0 > 1 -#if M0 > 2 - acc2 = fma(b0, (half8)a2.s3, acc2); -#endif // M0 > 2 -#if M0 > 3 - acc3 = fma(b0, (half8)a3.s3, acc3); -#endif // M0 > 3 - - src_addr.s0 += 4 * sizeof(half); - } - - for(; i < (int)K; ++i) - { -#if defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0)); -#if M0 > 1 - half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1)); -#endif // M0 > 1 -#if M0 > 2 - half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2)); -#endif // M0 > 2 -#if M0 > 3 - half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3)); 
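/* A scalar sketch of this leftover update (illustrative only, hypothetical
 * array names): each remaining k performs one rank-1 update of the 8-wide
 * accumulators, broadcasting the A element across the vector lanes:
 *
 *     for(int m = 0; m < M0; ++m)        // one half8 accumulator per row
 *     {
 *         for(int n = 0; n < 8; ++n)     // N0 is fixed to 8 in this kernel
 *         {
 *             acc[m][n] += b0[n] * a[m]; // fma(b0, (half8)a<m>, acc<m>)
 *         }
 *     }
 */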
-#endif // M0 > 3 -#else // defined(REINTERPRET_INPUT_AS_3D) - // Load values from matrix A - half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y)); -#if M0 > 1 - half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y)); -#endif // M0 > 1 -#if M0 > 2 - half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y)); -#endif // M0 > 2 -#if M0 > 3 - half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y)); -#endif // M0 > 3 -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Load values from matrix B - half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1)); - - src_addr += (int2)(sizeof(half), src1_stride_y); - - // Accumulate - acc0 = fma(b0, (half8)a0, acc0); // b0 * (half8)a0; -#if M0 > 1 - acc1 = fma(b0, (half8)a1, acc1); // b0 * (half8)a1; -#endif // M0 > 1 -#if M0 > 2 - acc2 = fma(b0, (half8)a2, acc2); // b0 * (half8)a2; -#endif // M0 > 2 -#if M0 > 3 - acc3 = fma(b0, (half8)a3, acc3); // b0 * (half8)a3; -#endif // M0 > 3 - } - - int z = get_global_id(2); - - // Compute dst address - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * dst_stride_y); - - uint4 zout = 0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - - // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension - // in order to take into account the presence of possible cross plane paddings - // - // | | - // | plane0 | - // | | - // |__________________| - // |******************| - // | cross_plane_pad | - // |******************| - // | | - // | plane1 | - // | | - // |__________________| - - // The plane (zout) is calculated dividing row by HEIGHT_GEMM3D - zout = ((uint4)(0, 1, 2, 3) + (uint4)(COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0))) / (uint4)HEIGHT_GEMM3D; - zout = min(DEPTH_GEMM3D - 1, zout); - - // Add offset due to the cross plane paddings - zout *= (dst_cross_plane_pad * dst_stride_y); - - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we
-    // multiply dst_stride_z by DEPTH_GEMM3D
-    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
-#else // defined(REINTERPRET_OUTPUT_AS_3D)
-    // Add offset for batched GEMM
-    dst_addr += z * dst_stride_z;
-#endif // defined(REINTERPRET_OUTPUT_AS_3D)
-
-    // Multiply by the weight of the matrix-matrix product and store the result
-#if defined(ALPHA)
-    SCALE_BLOCK(M0, half, acc, ALPHA);
-#endif // defined(ALPHA)
-
-    // Add beta*bias
-#if defined(BETA)
-    REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
-
-#if defined(BROADCAST_BIAS)
-    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
-
-    LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(1, half, bias, BETA);
-#endif // UNIT_BETA
-
-    // acc = acc + bias[broadcasted]
-    ADD_BLOCK_BROADCAST(M0, acc, bias0);
-
-#else // defined(BROADCAST_BIAS)
-    __global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (COMPUTE_M0_START_ROW(get_global_id(1), M0, PARTIAL_STORE_M0) * src2_stride_y) + z * src2_stride_z;
-
-    LOAD_BLOCK(M0, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
-
-#ifndef UNIT_BETA
-    SCALE_BLOCK(M0, half, bias, BETA);
-#endif // UNIT_BETA
-
-    // acc = acc + bias
-    ADD_BLOCK(M0, acc, bias);
-
-#endif // defined(BROADCAST_BIAS)
-#endif // defined(BETA)
-
-#if defined(ACTIVATION_TYPE)
-    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, half, VEC_SIZE, acc, A_VAL, B_VAL);
-#endif // defined(ACTIVATION_TYPE)
-
-    // Store the output block
-    const bool cond_y = get_global_id(1) == 0;
-    const bool cond_x = ((get_global_id(0) + 1) * 8 >= N);
-    STORE_BLOCK_BOUNDARY_AWARE(M0, 8, half, acc, dst_addr, dst_stride_y, zout.s, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
-}
-#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
-
-#endif // defined(N) && defined(K) && defined(M0) && defined(N0) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
deleted file mode 100644
index 5cafb5389c..0000000000
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ /dev/null
@@ -1,2160 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE. 
- */ -#include "gemm_helpers.h" -#include "helpers_asymm.h" -#include "repeat.h" -#include "tile_helpers.h" - -#if defined(DATA_TYPE) && defined(ACC_DATA_TYPE) - -#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) -#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) -#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), (val)); -#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) -#define ARM_DOT(x, y, val) val += arm_dot((x), (y)); -#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) -#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) - -#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) - -#define ARM_DOT1(a, b, c) \ - ({ \ - ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 3))0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 3))0), c); \ - }) -#define ARM_DOT2(a, b, c) \ - ({ \ - ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 2))0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 2))0), c); \ - }) -#define ARM_DOT3(a, b, c) \ - ({ \ - ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (DATA_TYPE)0), (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (DATA_TYPE)0), c); \ - }) -#define ARM_DOT4(a, b, c) \ - ({ \ - ARM_DOT(a, b, c); \ - }) -#define ARM_DOT8(a, b, c) \ - ({ \ - ARM_DOT4((a.lo), (b.lo), c); \ - ARM_DOT4((a.hi), (b.hi), c); \ - }) -#define ARM_DOT16(a, b, c) \ - ({ \ - ARM_DOT8((a.lo), (b.lo), c); \ - ARM_DOT8((a.hi), (b.hi), c); \ - }) - -#else // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) - -/** Specialized macros to perform the dot product instruction between two vectors of size K0 [1,16] without using the dot8 instruction. 
*/ -#define ARM_DOT1(a, b, c) \ - ({ \ - c += (ACC_DATA_TYPE)a * b; \ - }) -#define ARM_DOT2(a, b, c) \ - ({ \ - c += (ACC_DATA_TYPE)a.s0 * b.s0; \ - c += (ACC_DATA_TYPE)a.s1 * b.s1; \ - }) -#define ARM_DOT3(a, b, c) \ - ({ \ - ARM_DOT2(a, b, c); \ - c += (ACC_DATA_TYPE)a.s2 * b.s2; \ - }) -#define ARM_DOT4(a, b, c) \ - ({ \ - ARM_DOT3(a, b, c); \ - c += (ACC_DATA_TYPE)a.s3 * b.s3; \ - }) -#define ARM_DOT8(a, b, c) \ - ({ \ - ARM_DOT4((a.lo), (b.lo), c); \ - ARM_DOT4((a.hi), (b.hi), c); \ - }) -#define ARM_DOT16(a, b, c) \ - ({ \ - ARM_DOT8((a.lo), (b.lo), c); \ - ARM_DOT8((a.hi), (b.hi), c); \ - }) -#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) - -/** Specialized macros to perform a broadcast dot product operation between one vector "a" and N0 vectors "b" of size K0 [1,16] */ -#define ARM_DOT_K0X1(k0, a, b, c) \ - ({ \ - ARM_DOT_K0(k0, (a), (b##0), (c)); \ - }) -#define ARM_DOT_K0X2(k0, a, b, c) \ - ({ \ - ARM_DOT_K0(k0, (a), (b##0), (c.s0)); \ - ARM_DOT_K0(k0, (a), (b##1), (c.s1)); \ - }) -#define ARM_DOT_K0X3(k0, a, b, c) \ - ({ \ - ARM_DOT_K0X2(k0, a, b, c); \ - ARM_DOT_K0(k0, (a), (b##2), (c.s2)); \ - }) -#define ARM_DOT_K0X4(k0, a, b, c) \ - ({ \ - ARM_DOT_K0X3(k0, a, b, c); \ - ARM_DOT_K0(k0, (a), (b##3), (c.s3)); \ - }) -#define ARM_DOT_K0X8(k0, a, b, c) \ - ({ \ - ARM_DOT_K0X4(k0, a, b, c); \ - ARM_DOT_K0(k0, (a), (b##4), (c.s4)); \ - ARM_DOT_K0(k0, (a), (b##5), (c.s5)); \ - ARM_DOT_K0(k0, (a), (b##6), (c.s6)); \ - ARM_DOT_K0(k0, (a), (b##7), (c.s7)); \ - }) -#define ARM_DOT_K0X16(k0, a, b, c) \ - ({ \ - ARM_DOT_K0X8(k0, a, b, c); \ - ARM_DOT_K0(k0, (a), (b##8), (c.s8)); \ - ARM_DOT_K0(k0, (a), (b##9), (c.s9)); \ - ARM_DOT_K0(k0, (a), (b##A), (c.sA)); \ - ARM_DOT_K0(k0, (a), (b##B), (c.sB)); \ - ARM_DOT_K0(k0, (a), (b##C), (c.sC)); \ - ARM_DOT_K0(k0, (a), (b##D), (c.sD)); \ - ARM_DOT_K0(k0, (a), (b##E), (c.sE)); \ - ARM_DOT_K0(k0, (a), (b##F), (c.sF)); \ - }) - -/** Specialized macros to perform a partial matrix multiplication with dimensions M0,N0,K0 */ -#define ARM_MM_K0XN0X1(n0, k0, a, b, c) \ - ({ \ - ARM_DOT_K0XN0(n0, k0, (a##0), b, (c##0)); \ - }) -#define ARM_MM_K0XN0X2(n0, k0, a, b, c) \ - ({ \ - ARM_MM_K0XN0X1(n0, k0, a, b, c); \ - ARM_DOT_K0XN0(n0, k0, (a##1), b, (c##1)); \ - }) -#define ARM_MM_K0XN0X3(n0, k0, a, b, c) \ - ({ \ - ARM_MM_K0XN0X2(n0, k0, a, b, c); \ - ARM_DOT_K0XN0(n0, k0, (a##2), b, (c##2)); \ - }) -#define ARM_MM_K0XN0X4(n0, k0, a, b, c) \ - ({ \ - ARM_MM_K0XN0X3(n0, k0, a, b, c); \ - ARM_DOT_K0XN0(n0, k0, (a##3), b, (c##3)); \ - }) -#define ARM_MM_K0XN0X5(n0, k0, a, b, c) \ - ({ \ - ARM_MM_K0XN0X4(n0, k0, a, b, c); \ - ARM_DOT_K0XN0(n0, k0, (a##4), b, (c##4)); \ - }) -#define ARM_MM_K0XN0X6(n0, k0, a, b, c) \ - ({ \ - ARM_MM_K0XN0X5(n0, k0, a, b, c); \ - ARM_DOT_K0XN0(n0, k0, (a##5), b, (c##5)); \ - }) -#define ARM_MM_K0XN0X7(n0, k0, a, b, c) \ - ({ \ - ARM_MM_K0XN0X6(n0, k0, a, b, c); \ - ARM_DOT_K0XN0(n0, k0, (a##6), b, (c##6)); \ - }) -#define ARM_MM_K0XN0X8(n0, k0, a, b, c) \ - ({ \ - ARM_MM_K0XN0X7(n0, k0, a, b, c); \ - ARM_DOT_K0XN0(n0, k0, (a##7), b, (c##7)); \ - }) - -#define ARM_DOT_K0(k0, a, b, c) \ - ({ \ - CONCAT(ARM_DOT, k0) \ - ((a), (b), (c)); \ - }) - -#define ARM_DOT_K0XN0(n0, k0, a, b, c) \ - ({ \ - CONCAT(ARM_DOT_K0X, n0) \ - (k0, (a), b, (c)); \ - }) - -#define ARM_MM_K0XN0XM0(m0, n0, k0, a, b, c) \ - ({ \ - CONCAT(ARM_MM_K0XN0X, m0) \ - (n0, k0, a, b, c); \ - }) - -/** Specialized macros to perform a broadcast dot product operation between one vector "a" and N0 vectors "b" of size K0 
[1,16] */ -#define ARM_MUL_N0X1(VECTOR_ACC_TYPE, a, b, c) \ - ({ \ - c += CONVERT(b##0, VECTOR_ACC_TYPE) * a; \ - }) -#define ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c) \ - ({ \ - c += CONVERT(b##0, VECTOR_ACC_TYPE) * a.s##0; \ - c += CONVERT(b##1, VECTOR_ACC_TYPE) * a.s##1; \ - }) -#define ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c) \ - ({ \ - ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c); \ - c += CONVERT(b##2, VECTOR_ACC_TYPE) * a.s##2; \ - }) -#define ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c) \ - ({ \ - ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c); \ - c += CONVERT(b##3, VECTOR_ACC_TYPE) * a.s##3; \ - }) -#define ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c) \ - ({ \ - ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c); \ - c += CONVERT(b##4, VECTOR_ACC_TYPE) * a.s##4; \ - c += CONVERT(b##5, VECTOR_ACC_TYPE) * a.s##5; \ - c += CONVERT(b##6, VECTOR_ACC_TYPE) * a.s##6; \ - c += CONVERT(b##7, VECTOR_ACC_TYPE) * a.s##7; \ - }) -#define ARM_MUL_N0X16(VECTOR_ACC_TYPE, a, b, c) \ - ({ \ - ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c); \ - c += CONVERT(b##8, VECTOR_ACC_TYPE) * a.s##8; \ - c += CONVERT(b##9, VECTOR_ACC_TYPE) * a.s##9; \ - c += CONVERT(b##A, VECTOR_ACC_TYPE) * a.s##A; \ - c += CONVERT(b##B, VECTOR_ACC_TYPE) * a.s##B; \ - c += CONVERT(b##C, VECTOR_ACC_TYPE) * a.s##C; \ - c += CONVERT(b##D, VECTOR_ACC_TYPE) * a.s##D; \ - c += CONVERT(b##E, VECTOR_ACC_TYPE) * a.s##E; \ - c += CONVERT(b##F, VECTOR_ACC_TYPE) * a.s##F; \ - }) -/** Specialized macros to perform a a partial matrix multiplication with dimensions M0,N0,K0 */ -#define ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c) \ - ({ \ - ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##0), b, (c##0)); \ - }) -#define ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c) \ - ({ \ - ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c); \ - ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##1), b, (c##1)); \ - }) -#define ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c) \ - ({ \ - ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c); \ - ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##2), b, (c##2)); \ - }) -#define ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c) \ - ({ \ - ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c); \ - ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##3), b, (c##3)); \ - }) -#define ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c) \ - ({ \ - ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c); \ - ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##4), b, (c##4)); \ - }) -#define ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c) \ - ({ \ - ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c); \ - ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##5), b, (c##5)); \ - }) -#define ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c) \ - ({ \ - ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c); \ - ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##6), b, (c##6)); \ - }) -#define ARM_MM_NATIVE_N0XK0X8(VECTOR_ACC_TYPE, k0, a, b, c) \ - ({ \ - ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c); \ - ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##7), b, (c##7)); \ - }) -#define ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, a, b, c) \ - ({ \ - CONCAT(ARM_MUL_N0X, k0) \ - (VECTOR_ACC_TYPE, (a), b, (c)); \ - }) -#define ARM_MM_NATIVE_N0XK0XM0(VECTOR_ACC_TYPE, m0, k0, a, b, c) \ - ({ \ - CONCAT(ARM_MM_NATIVE_N0XK0X, m0) \ - (VECTOR_ACC_TYPE, k0, a, b, c); \ - }) - -#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && defined(N) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) -/** This OpenCL kernel computes the matrix multiplication between 2 matrices with QASYMM/QASYMM_SIGNED data type. 
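A host-side sketch of how such a kernel is typically configured (values illustrative only; the option names are the ones documented in this comment, and PARTIAL_STORE_M0/PARTIAL_STORE_N0 are assumed to carry the leftovers M % M0 and N % N0):

    const char *build_opts = "-DDATA_TYPE=uchar -DACC_DATA_TYPE=uint "
                             "-DM=52 -DN=90 "
                             "-DM0=4 -DN0=8 -DK0=4 -DV0=2 -DH0=2 "
                             "-DPARTIAL_STORE_M0=0 -DPARTIAL_STORE_N0=2";
    /* handed to clBuildProgram(program, 1, &device, build_opts, NULL, NULL) */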
- * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed - * The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed - * - * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar) - * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint) - * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time. - * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (i.e. -DM=52 and -DN=90). - * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4). - * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (i.e. -DV0=2) - * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2) - * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time. - * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time. - * @note Only the following configurations of M0, N0 and K0 are currently supported: - * - M0 = 2, 3, 4, 5, 6, 7, 8 - * - N0 = 2, 3, 4, 8, 16 - * - K0 = 2, 3, 4, 8, 16 - * - V0 >= 1 - * - H0 >= 1 - * - * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time: - * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D - * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. - * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped - * - * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: QASYMM8/QASYMM_SIGNED - * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes) - * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes) - * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix - * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. 
Supported data type: same as @p lhs_ptr - * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes) - * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes) - * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32 - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped. - * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes) - * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), - IMAGE_DECLARATION(rhs), - IMAGE_DECLARATION(dst), - uint k, - uint lhs_stride_z, - uint rhs_stride_z, - uint dst_stride_z -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - // Block size -#define LHS_BLOCK_SIZE ((K0) * (M0)) - -#if defined(LHS_INTERLEAVE) -#define LHS_OFFSET_X (K0) -#define LHS_STEP_X ((K0) * (V0)) -#define LHS_STEP_LOOP (1) -#else // defined(INTERLEAVE) -#define LHS_OFFSET_X (LHS_BLOCK_SIZE) -#define LHS_STEP_X (K0) -#define LHS_STEP_LOOP (V0) -#endif // defined(INTERLEAVE) - - // Block size -#define RHS_BLOCK_SIZE ((K0) * (N0)) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (K0) -#define RHS_STEP_X ((K0) * (H0)) -#define RHS_STEP_LOOP (1) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (RHS_BLOCK_SIZE) -#define RHS_STEP_X (K0) -#define RHS_STEP_LOOP (H0) -#endif // defined(RHS_INTERLEAVE) - - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - __global DATA_TYPE *lhs_addr = (__global DATA_TYPE *)(lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z)); - - // Compute RHS matrix address - __global DATA_TYPE *rhs_addr = (__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y); - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z; -#else // defined(MATRIX_B_DEPTH) - rhs_addr += z * rhs_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint 
zout0=0,zout1=0,zout2=0,... zout7=0; - REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0); - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0; - - for(int i = 0; i < k; i += K0) - { - // Load values from LHS matrix - LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X, zlhs); - - // Load values from RHS matrix - LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X, zrhs); - - // Partial matrix multiplication M0,N0,K0 - ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c); - - // Update address - lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP); - rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP); - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (y * (uint)M0 * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, y * M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - - // Convert and store output block - const bool cond_y = ((get_global_id(1) + 1) * M0 >= M); - const bool cond_x = ((get_global_id(0) + 1) * N0 >= N); - - // Store output block - REPEAT_VAR_INIT_CONVERT_SAT(M0, VEC_DATA_TYPE(int, N0), c, c_lp); - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); - -#undef LHS_BLOCK_SIZE -#undef LHS_OFFSET_X -#undef LHS_STEP_X -#undef RHS_BLOCK_SIZE -#undef RHS_OFFSET_X -#undef RHS_STEP_X -} -#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && defined(N) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) - -#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) - -#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT) -#define FUSED_OUTPUT_STAGE_FIXED_POINT -#endif // defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT) - -/** This OpenCL kernel computes the matrix multiplication between 2 matrices with fused output stage using fixed-point arithmetic. - * The LHS matrix is NOT reshaped - * The RHS matrix is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed - * - * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar) - * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint) - * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64) - * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4). - * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2) - * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. 
-DH0=2) - * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time. - * @note Only the following configurations of M0, N0 and K0 are currently supported: - * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 - * - N0 = 2, 3, 4, 8, 16 - * - K0 = 2, 3, 4, 8, 16 - * - H0 >= 1 - * - * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time: - * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D - * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D - * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. - * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix - * - * @note The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULTIPLIER and -DRESULT_SHIFT - * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time - * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE - * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND. - * These values can be used to implement "rectified linear unit" activation functions - * @note In case of per-channel quantization of matrix B, -DPER_CHANNEL_QUANTIZATION must be passed at compile time. - * - * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: QASYMM8/QASYMM8_SIGNED - * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes) - * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes) - * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix - * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. 
Supported data type: same as @p lhs_ptr - * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes) - * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes) - * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes) - * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D) - * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: S32 - * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes) - * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes) - * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor - * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: S32 - * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes) - * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes) - * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor - * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: S32 - * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor - * @param[in] result_multipliers_ptr (Optional) Pointer to the output multipliers vector for per-channel quantization. 
Supported data types: S32 - * @param[in] result_multipliers_stride_x (Optional) Stride of the output multipliers vector in X dimension (in bytes) - * @param[in] result_multipliers_step_x (Optional) output_multipliers_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first element in the output multipliers vector - * @param[in] result_shifts_ptr (Optional) Pointer to the output shifts vector for per-channel quantization. Supported data types: S32 - * @param[in] result_shifts_stride_x (Optional) Stride of the output shifts vector in X dimension (in bytes) - * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first element in the output shifts vector - */ -#if defined(FUSED_OUTPUT_STAGE_FIXED_POINT) -__kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint -#else // defined(FUSED_OUTPUT_STAGE_FIXED_POINT) -__kernel void gemmlowp_mm_reshaped_only_rhs_t -#endif // defined(FUSED_OUTPUT_STAGE_FIXED_POINT) -(IMAGE_DECLARATION(lhs), - IMAGE_DECLARATION(rhs), - IMAGE_DECLARATION(dst), - uint lhs_stride_z, - uint rhs_stride_z, - uint dst_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint lhs_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D -#if defined(A_OFFSET) - , - IMAGE_DECLARATION(sum_col) -#endif // defined(A_OFFSET) -#if defined(B_OFFSET) - , - IMAGE_DECLARATION(sum_row) -#endif // defined(B_OFFSET) -#if defined(ADD_BIAS) - , - VECTOR_DECLARATION(biases) -#endif // defined(ADD_BIAS) -#if defined(PER_CHANNEL_QUANTIZATION) - , - VECTOR_DECLARATION(result_multipliers), - VECTOR_DECLARATION(result_shifts) -#endif // defined(PER_CHANNEL_QUANTIZATION) -) -{ - // @note: replace with (DIMENSION + PAD) once we pass the relevant info at compile time -#define FULL_LHS_HEIGHT (lhs_stride_z / lhs_stride_y) -#define FULL_DST_HEIGHT (dst_stride_z / dst_stride_y) - - // RHS offset and step X -#if defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (K0) -#define RHS_STEP_X (K0 * H0) -#else // defined(RHS_INTERLEAVE) -#define RHS_OFFSET_X (K0 * N0) -#define RHS_STEP_X (K0) -#endif // defined(RHS_INTERLEAVE) -#define RHS_STEP_LOOP (N0 * K0 * H0) - - uint x = GET_SPATIAL_IDX(0, 1, 1); - uint y = GET_SPATIAL_IDX(1, M0, PARTIAL_STORE_M0); - uint z = GET_SPATIAL_IDX(2, 1, 1); - int xo = (x * N0); - -#if defined(DUMMY_WORK_ITEMS) - if((xo >= N) || (y >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - uint lhs_y = y + z * FULL_LHS_HEIGHT; - - // Compute RHS matrix address - uint rhs_offset_x = (x % H0) * RHS_OFFSET_X; - uint rhs_offset_y = (x / H0) * rhs_stride_y; - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - rhs_offset_y += (z % MATRIX_B_DEPTH) * rhs_stride_z; -#else // defined(MATRIX_B_DEPTH) - rhs_offset_y += z * rhs_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - // Initialize the accumulators - TILE(ACC_DATA_TYPE, M0, N0, c); - LOOP_UNROLLING(int, i, 0, 1, M0, - { - c[i].v = 0; - }) - - int i = 0; - for(; i <= (K - K0); i += K0) - { - TILE(DATA_TYPE, M0, K0, a); - TILE(DATA_TYPE, N0, K0, b); - - // Load values from LHS matrix - T_LOAD(DATA_TYPE, M0, K0, BUFFER, lhs, i, lhs_y, 1, lhs_stride_y, 
a); - - // // Load values from RHS matrix - LOOP_UNROLLING(int, _i, 0, 1, N0, - { - b[_i].v = VLOAD(K0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + rhs_offset_x + rhs_offset_y + _i * RHS_STEP_X)); - }) - - // Partial matrix multiplication M0,N0,K0 - T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c); - - rhs_offset_x += RHS_STEP_LOOP; - } - -#if((K % K0) != 0) - - // Left-over accumulations - for(; i < K; ++i) - { - TILE(DATA_TYPE, M0, 1, a); - TILE(DATA_TYPE, N0, 1, b); - - // Load values from LHS matrix - T_LOAD(DATA_TYPE, M0, 1, BUFFER, lhs, i, lhs_y, 1, lhs_stride_y, a); - - LOOP_UNROLLING(int, _i, 0, 1, N0, - { - b[_i].v = *(__global DATA_TYPE *)(rhs_ptr + rhs_offset_first_element_in_bytes + rhs_offset_x + rhs_offset_y + _i * RHS_STEP_X); - }) - - T_MMUL(DATA_TYPE, DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c); - - rhs_offset_x += 1; - } -#endif // ((K % K0) != 0) - -#if defined(FUSED_OUTPUT_STAGE_FIXED_POINT) - - TILE(int, M0, N0, c_int); - TILE(int, M0, N0, offset_s32); - LOOP_UNROLLING(int, i, 0, 1, M0, - { - offset_s32[i].v = (VEC_DATA_TYPE(int, N0))K_OFFSET; - }) - - LOOP_UNROLLING(int, i, 0, 1, M0, - { - c_int[i].v = CONVERT_SAT(c[i].v, VEC_DATA_TYPE(int, N0)); - }) - -#if defined(A_OFFSET) - -#if defined(SUM_COL_HAS_BATCHES) - int sum_col_y = z; -#else // defined(SUM_COL_HAS_BATCHES) - int sum_col_y = 0; -#endif // defined(SUM_COL_HAS_BATCHES) - TILE(int, 1, N0, a_offset_s32); - - T_LOAD(int, 1, N0, BUFFER, sum_col, xo, sum_col_y, 1, sum_col_stride_y, a_offset_s32); - - a_offset_s32[0].v *= A_OFFSET; - - T_ADD_BROADCAST_X(int, M0, 1, offset_s32, a_offset_s32, offset_s32); -#endif // defined(A_OFFSET) - -#if defined(B_OFFSET) - // Compute the offset contribution due to B_OFFSET - // Note: The sum_row tensor is generated through CLGEMMLowpMatrixAReductionKernel which - // does not introduce paddings. 
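/* For reference, the contribution assembled in this output stage follows the
 * usual gemmlowp identity (a sketch; sign conventions are absorbed into the
 * A_OFFSET/B_OFFSET constants, and K_OFFSET is assumed to equal
 * K * A_OFFSET * B_OFFSET):
 *
 *     full[i][j] = acc[i][j]
 *                + A_OFFSET * sum_col[j]   // column sums of matrix B
 *                + B_OFFSET * sum_row[i]   // row sums of matrix A
 *                + K_OFFSET;               // constant term
 *
 * which is what the offset_s32 tiles accumulate before being added to c_int.
 */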
For this reason is safe to access the tensor in this manner - // without considering that the coordinate "y" could come from an input 3D tensor - TILE(int, M0, N0, b_offset_s32); - - T_LOAD(int, M0, 1, BUFFER, sum_row, y + z * (sum_row_stride_y / sizeof(int)), 0, 1, sum_row_stride_x, b_offset_s32); - - LOOP_UNROLLING(int, i, 0, 1, M0, - { - offset_s32[i].v += b_offset_s32[i].v *B_OFFSET; - }) - -#endif // defined(B_OFFSET) - -#if defined(ADD_BIAS) - - TILE(int, 1, N0, bias); - - T_LOAD(int, 1, N0, BUFFER, biases, xo, 0, 1, 0, bias); - - T_ADD_BROADCAST_X(ACC_DATA_TYPE, M0, 1, offset_s32, bias, offset_s32); -#endif // defined(ADD_BIAS) - - LOOP_UNROLLING(int, i, 0, 1, M0, - { - c_int[i].v += offset_s32[i].v; - }) - - TILE(DATA_TYPE, M0, N0, c_lp); - - // Multiply by result_mult_int and shift -#if defined(PER_CHANNEL_QUANTIZATION) - TILE(int, 1, N0, res_mul); - TILE(int, 1, N0, res_shift); - - T_LOAD(int, 1, N0, BUFFER, result_multipliers, xo, 0, 0, 0, res_mul); - T_LOAD(int, 1, N0, BUFFER, result_shifts, xo, 0, 0, 0, res_shift); - - T_QUANTIZE8(int, DATA_TYPE, PER_CHANNEL, M0, N0, RESULT_OFFSET, RESULT_SHIFT, RESULT_MULTIPLIER, c_int, res_mul, res_shift, c_lp); -#else // defined(PER_CHANNEL_QUANTIZATION) - T_QUANTIZE8(int, DATA_TYPE, PER_TENSOR, M0, N0, RESULT_OFFSET, RESULT_SHIFT, RESULT_MULTIPLIER, c_int, 0, 0, c_lp); -#endif // defined(PER_CHANNEL_QUANTIZATION) - -#if defined(MIN_BOUND) - LOOP_UNROLLING(int, i, 0, 1, M0, - { - c_lp[i].v = max(c_lp[i].v, (VEC_DATA_TYPE(DATA_TYPE, N0))MIN_BOUND); - }) -#endif // defined(MIN_BOUND) -#if defined(MAX_BOUND) - LOOP_UNROLLING(int, i, 0, 1, M0, - { - c_lp[i].v = min(c_lp[i].v, (VEC_DATA_TYPE(DATA_TYPE, N0))MAX_BOUND); - }) -#endif // defined(MAX_BOUND) - -#else // defined(FUSED_OUTPUT_STAGE_FIXED_POINT) - TILE(int, M0, N0, c_lp); - - LOOP_UNROLLING(int, i, 0, 1, M0, - { - c_lp[i].v = CONVERT_SAT(c[i].v, VEC_DATA_TYPE(int, N0)); - }) -#endif // defined(FUSED_OUTPUT_STAGE_FIXED_POINT) - - TILE(uint, M0, 1, dst_indirect_y); - - LOOP_UNROLLING(int, i, 0, 1, M0, - { -#if defined(REINTERPRET_OUTPUT_AS_3D) - dst_indirect_y[i].v = (uint)min((int)((y + i) % HEIGHT_GEMM3D), (int)HEIGHT_GEMM3D - 1); - dst_indirect_y[i].v += (uint)min((int)((y + i) / HEIGHT_GEMM3D), (int)DEPTH_GEMM3D - 1) * FULL_DST_HEIGHT; - dst_indirect_y[i].v += z *FULL_DST_HEIGHT *DEPTH_GEMM3D; -#else // (REINTERPRET_OUTPUT_AS_3D) - dst_indirect_y[i].v = (uint)min((int)y + i, (int)M - 1) + z *FULL_DST_HEIGHT; -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - }) - - const bool cond_x = (xo > (N - N0)); - -#if defined(FUSED_OUTPUT_STAGE_FIXED_POINT) - T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, xo, dst_stride_y, cond_x, c_lp, dst_indirect_y); -#else // defined(FUSED_OUTPUT_STAGE_FIXED_POINT) - T_STORE_INDIRECT_WIDTH_SELECT(int, M0, N0, PARTIAL_STORE_N0, BUFFER, dst, xo, dst_stride_y, cond_x, c_lp, dst_indirect_y); -#endif // defined(FUSED_OUTPUT_STAGE_FIXED_POINT) - -#undef RHS_OFFSET_X -#undef RHS_STEP_X -#undef RHS_STEP_LOOP -} -#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) - -#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) - -/** This OpenCL kernel computes the matrix multiplication between 2 matrices. - * The LHS matrix is NOT reshaped - * The RHS matrix is NOT reshaped - * - * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. 
-DDATA_TYPE=uchar) - * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint) - * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64) - * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2) - * @note The number of N0 columns to process must be passed at compile time using -DN0 (i.e. -DN0=2) - * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (i.e., -DK0=2) - * @note Only the following configurations of M0, N0 and K0 are currently supported: - * - M0 = 1, 2, 3, 4, 5, 6, 7, 8 - * - N0 = 2, 3, 4, 8, 16 - * - K0 = 2, 3, 4, 8, 16 - * - * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time: - * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D - * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D - * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor. - * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor - * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix - * - * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: QASYMM8 - * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes) - * @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes) - * @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix - * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. 
Supported data type: same as @p lhs_ptr - * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes) - * @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes) - * @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix - * @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32 - * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix - * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes) - * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D) - * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D) - */ -__kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs), - IMAGE_DECLARATION(rhs), - IMAGE_DECLARATION(dst), - uint lhs_stride_z, - uint rhs_stride_z, - uint dst_stride_z -#if defined(REINTERPRET_INPUT_AS_3D) - , - uint lhs_cross_plane_pad -#endif // REINTERPRET_INPUT_AS_3D -#if defined(REINTERPRET_OUTPUT_AS_3D) - , - uint dst_cross_plane_pad -#endif // REINTERPRET_OUTPUT_AS_3D - ) -{ - uint x = get_global_id(0); - uint y = get_global_id(1); - uint z = get_global_id(2); - -#if defined(DUMMY_WORK_ITEMS) - if((x * N0 >= N) || (y * M0 >= M)) - { - return; - } -#endif // defined(DUMMY_WORK_ITEMS) - - // Compute LHS matrix address - uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y; - - // Compute RHS matrix address - uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE); - -#if defined(MATRIX_B_DEPTH) - // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3 - rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z; -#else // defined(MATRIX_B_DEPTH) - rhs_offset += z * rhs_stride_z; -#endif // defined(MATRIX_B_DEPTH) - - REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); - REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0); - -#if defined(REINTERPRET_INPUT_AS_3D) - // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y); - - // Add offset for batched GEMM. 
The batches will be in the fourth dimension and for this reason we - // multiply lhs_stride_z by DEPTH_GEMM3D - lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_INPUT_AS_3D) - - // Add offset for batched GEMM - lhs_offset += z * lhs_stride_z; - -#endif // defined(REINTERPRET_INPUT_AS_3D) - - // Initialize the accumulators - REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0; - - int i = 0; - - for(; i <= (K - K0); i += K0) - { - // Load values from LHS matrix - LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); - - // Load values from RHS matrix - LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs); - - // Partial matrix multiplication M0,N0,K0 -#if(GPU_ARCH == GPU_ARCH_MIDGARD) - ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, K0, a, b, c); -#else // GPU_ARCH == GPU_ARCH_MIDGARD - // Transpose the values from RHS matrix - TRANSPOSE_K0XN0(K0, N0, b_t, b, DATA_TYPE); - - ARM_MM_K0XN0XM0(M0, N0, K0, a, b_t, c); -#endif // GPU_ARCH == GPU_ARCH_MIDGARD - - // Update the offset - lhs_offset += K0; - rhs_offset += K0 * rhs_stride_y; - } - - // Left-over for loop - for(; i < K; ++i) - { - // Load values from LHS matrix - LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs); - - // Load values from RHS matrix - LOAD_BLOCK(1, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs); - - // Partial matrix multiplication M0,N0,1 -#if(GPU_ARCH == GPU_ARCH_MIDGARD) - ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, 1, a, b, c); -#else // GPU_ARCH == GPU_ARCH_MIDGARD - // Transpose the values from RHS matrix - TRANSPOSE_K0XN0(1, N0, b_t, b, DATA_TYPE); - - ARM_MM_K0XN0XM0(M0, N0, 1, a, b_t, c); -#endif // GPU_ARCH == GPU_ARCH_MIDGARD - - // Update the offset - lhs_offset += 1; - rhs_offset += rhs_stride_y; - } - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y); - - REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0; - -#if defined(REINTERPRET_OUTPUT_AS_3D) - // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D - CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y); - - // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we - // multiply dst_stride_z by DEPTH_GEMM3D - dst_addr += z * dst_stride_z * DEPTH_GEMM3D; - -#else // defined(REINTERPRET_OUTPUT_AS_3D) - - // Add offset for batched GEMM - dst_addr += z * dst_stride_z; - -#endif // defined(REINTERPRET_OUTPUT_AS_3D) - const bool cond_y = y == 0; - const bool cond_x = ((x + 1) * N0 >= N); - - // Convert and store output block - REPEAT_VAR_INIT_CONVERT(M0, VEC_DATA_TYPE(int, N0), c, res); // resN = CONVERT(cN, VEC_DATA_TYPE(int, N0)); - STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, res, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x); -} -#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0) - -#if defined(COLS_A) -/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A. - * It is also possible to multiply each reduced row by a scalar value, if SCALAR is passed at compile time. 
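In scalar terms the reduction is a plain per-row sum, optionally scaled. A minimal C sketch (hypothetical names; uchar input and int32 accumulation, as in the QASYMM8 case):

    #include <stdint.h>

    void reduce_rows_a(const uint8_t *a, int rows, int cols, int scalar, int32_t *sum_rows)
    {
        for(int r = 0; r < rows; ++r)
        {
            int32_t s = 0;
            for(int c = 0; c < cols; ++c) // COLS_A accumulations per row
            {
                s += a[r * cols + c];
            }
            sum_rows[r] = s * scalar; // scalar is 1 when -DSCALAR is not passed
        }
    }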
- * - * @note This stage is needed to handle the offset of matrix product - * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md - * - * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A - * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar) - * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint) - * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (e.g. -DSCALAR=3) - * - * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED/QSYMM8 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32 - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - // Compute source and destination addresses - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) - sum_row_32 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))0; - ACC_DATA_TYPE sum_row = 0; - - __global const DATA_TYPE *matrix_a = (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z); - - int i = 0; - - // This for loop performs 16 accumulations - for(; i <= ((int)COLS_A - 16); i += 16) - { - const VEC_DATA_TYPE(DATA_TYPE, 16) a0 = vload16(0, matrix_a + i); - - sum_row_32 += CONVERT(a0.s0123, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.s4567, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.s89AB, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) + CONVERT(a0.sCDEF, - VEC_DATA_TYPE(ACC_DATA_TYPE, 4)); - } - - // This for loop performs the leftover accumulations - for(; i < COLS_A; ++i) - { - sum_row += (ACC_DATA_TYPE)matrix_a[i]; - } - - sum_row += sum_row_32.s0 + sum_row_32.s1 + sum_row_32.s2 + sum_row_32.s3; - -#if defined(SCALAR) - sum_row *= (int)SCALAR; -#endif // defined(SCALAR) - *((__global int *)dst.ptr) = (int)sum_row; -} - -#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) -/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A using the arm dot product instruction. - * It is also possible to multiply each reduced row by a scalar value, if SCALAR is passed at compile time. 
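The dot8 variant below replaces the scalar adds with dot products against a vector of ones, relying on arm_dot((uchar4)(a, b, c, d), (uchar4)(1)) == a + b + c + d. One 16-lane step of the reduction then looks like this (OpenCL C; assumes the cl_arm_integer_dot_product_int8 extension and DATA_TYPE=uchar):

    uchar16 v = vload16(0, matrix_a + i);
    sum_row += arm_dot(v.s0123, (uchar4)(1)); // lanes 0-3
    sum_row += arm_dot(v.s4567, (uchar4)(1)); // lanes 4-7
    sum_row += arm_dot(v.s89AB, (uchar4)(1)); // lanes 8-11
    sum_row += arm_dot(v.sCDEF, (uchar4)(1)); // lanes 12-15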
- * - * @note This stage is needed to handle the offset of matrix product - * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md - * - * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A - * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar) - * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint) - * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (e.g. -DSCALAR=3) - * - * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED/QSYMM8 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32 - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - // Compute source and destination addresses - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - ACC_DATA_TYPE sum_row = 0; - - __global const DATA_TYPE *matrix_a = (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z); - - int i = 0; - - // This for loop performs 32 accumulations - for(; i <= ((int)COLS_A - 32); i += 32) - { - VEC_DATA_TYPE(DATA_TYPE, 16) - a0 = vload16(0, matrix_a + i); - - sum_row += arm_dot(a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); - sum_row += arm_dot(a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); - sum_row += arm_dot(a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); - sum_row += arm_dot(a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); - - a0 = vload16(1, matrix_a + i); - - sum_row += arm_dot(a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); - sum_row += arm_dot(a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); - sum_row += arm_dot(a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); - sum_row += arm_dot(a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1)); - } - - // This for loop performs the leftover accumulations - for(; i < COLS_A; ++i) - { - sum_row += (ACC_DATA_TYPE)matrix_a[i]; - } - -#if defined(SCALAR) - sum_row *= (int)SCALAR; -#endif // defined(SCALAR) - *((__global int *)dst.ptr) = (int)sum_row; -} -#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) -#endif // defined(COLS_A) - -#if defined(COLS_B) && defined(ROWS_B) && defined(VEC_SIZE) &&
defined(VEC_SIZE_LEFTOVER) -/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B. - * It is also possible to multiply each reduced column by a scalar value, if SCALAR is passed at compile time. - * - * @note This stage is needed to handle the offset of matrix product - * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md - * - * @attention The number of matrix B columns and rows needs to be passed at compile time using -DCOLS_B and -DROWS_B - * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e. -DDATA_TYPE=uchar) - * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE (i.e. -DACC_DATA_TYPE=uint) - * @note In case of scaling the scalar value must be passed at compile time using -DSCALAR (i.e. -DSCALAR=3) - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE - * - * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32 - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - // Compute source and destination addresses - const uint x_offs = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); - const uint y = get_global_id(1); - - __global const DATA_TYPE *matrix_b = (__global const DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + y * src_step_y + y * src_stride_z); - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(int) + y * dst_stride_y; - - VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) - sum_col_32 = (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))0; - - int i = 0; - // This for loop performs 4 accumulations - for(; i <= ((int)ROWS_B - 4); i += 4) - { - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - b0 = VLOAD(VEC_SIZE)(0, matrix_b + 0 * src_stride_y); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - b1 = VLOAD(VEC_SIZE)(0, matrix_b + 1 * src_stride_y); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - b2 = 
VLOAD(VEC_SIZE)(0, matrix_b + 2 * src_stride_y); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - b3 = VLOAD(VEC_SIZE)(0, matrix_b + 3 * src_stride_y); - - sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b1, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b2, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)) + CONVERT(b3, - VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); - - matrix_b += 4 * src_stride_y; - } - - // This for loop performs the leftover accumulations - for(; i < (int)ROWS_B; ++i) - { - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - b0 = VLOAD(VEC_SIZE)(0, matrix_b); - - sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); - - matrix_b += src_stride_y; - } - -#if defined(SCALAR) - sum_col_32 *= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))SCALAR; -#endif // defined(SCALAR) - VEC_DATA_TYPE(int, VEC_SIZE) - res0 = CONVERT(sum_col_32, VEC_DATA_TYPE(int, VEC_SIZE)); - - STORE_VECTOR_SELECT(res, int, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) -} -#endif // defined(COLS_B) && defined(ROWS_B) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) - -#endif // defined(DATA_TYPE) && defined(ACC_DATA_TYPE) - -#if defined(K_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) - -#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) - -/* Helper function used to calculate the offset contribution after matrix multiplication. - * - * This kernel takes a final int32 accumulator value (the output of matrix multiplication), - * and calculates the offset contribution of matrix A and matrix B. - * - * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200) - * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1) - * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6) - * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE - * - * @param[in] x max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) - * @param[in] y get_global_id(1) - * @param[in] z get_global_id(2) - * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr - * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes) - * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes) - * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor - * @param[in] sum_row_ptr (Optional) Pointer to the source tensor.
Supported data type: same as @p mm_result_ptr - * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes) - * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes) - * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor - * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr - * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor - */ -inline VEC_INT offset_contribution( - int x, - int y, - int z -#if defined(A_OFFSET) - , - IMAGE_DECLARATION(sum_col) -#endif // defined(A_OFFSET) -#if defined(B_OFFSET) - , - IMAGE_DECLARATION(sum_row) -#endif // defined(B_OFFSET) -#if defined(ADD_BIAS) - , - VECTOR_DECLARATION(biases) -#endif // defined(ADD_BIAS) -) -{ - VEC_INT a_offset_s32 = (VEC_INT)0; - VEC_INT b_offset_s32 = (VEC_INT)0; - - int batch_id = z; -#if defined(DEPTH_INPUT3D) - batch_id /= (int)DEPTH_INPUT3D; -#endif // defined(DEPTH_INPUT3D) - -#if defined(A_OFFSET) - // Compute the offset contribution due to A_OFFSET - __global uchar *sum_col_addr = sum_col_ptr + sum_col_offset_first_element_in_bytes + x * sizeof(int); - -#if defined(SUM_COL_HAS_BATCHES) - a_offset_s32 = VLOAD(VEC_SIZE)(0, (__global int *)(sum_col_addr + batch_id * sum_col_stride_y)); -#else // defined(SUM_COL_HAS_BATCHES) - a_offset_s32 = VLOAD(VEC_SIZE)(0, (__global int *)sum_col_addr); -#endif // defined(SUM_COL_HAS_BATCHES) - - a_offset_s32 *= (VEC_INT)A_OFFSET; -#endif // defined(A_OFFSET) - -#if defined(B_OFFSET) - // Compute the offset contribution due to B_OFFSET - __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + y * sizeof(int); - -#if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D) - b_offset_s32 = (VEC_INT) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)) + (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D); -#else // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D) - b_offset_s32 = (VEC_INT) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y))); -#endif // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D) - b_offset_s32 *= (VEC_INT)B_OFFSET; -#endif // defined(B_OFFSET) - -#if defined(ADD_BIAS) - // Add bias - __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); - - VEC_INT biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr); - b_offset_s32 += (VEC_INT)biases_values; -#endif // defined(ADD_BIAS) - - return (VEC_INT)K_OFFSET + a_offset_s32 + b_offset_s32; -} - -/* OpenCL kernel used to add the offset contribution after matrix multiplication. The computation is performed in-place. - * - * This kernel takes a final int32 accumulator value (the output of matrix multiplication), - * and adds to it the offset contribution of matrix A and matrix B in-place.
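- *
- * As a rough worked example, reusing the illustrative values from the notes below: with k = 200 matrix A columns, a_offset = 1 and b_offset = 6, the compile-time constant is k_offset = 1 * 6 * 200 = 1200, and each accumulator element is corrected as
- *
- *   mm_result[i][k] += (sum_col[k] * 1) + (sum_row[i] * 6) + 1200;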
- * - * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200) - * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1) - * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6) - * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE - * - * The final result is: - * - * mm_result[i][k] = mm_result[i][k] + - * (sum_col[k] * A_OFFSET) + - * (sum_row[i] * B_OFFSET) + - * (K_OFFSET) - * - * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32 - * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr - * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes) - * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes) - * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor - * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr - * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes) - * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes) - * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor - * @param[in] biases_ptr (Optional) Pointer to the biases tensor. 
Supported data type: same as @p src_ptr - * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor - */ -__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result) -#if defined(A_OFFSET) - , - IMAGE_DECLARATION(sum_col) -#endif // defined(A_OFFSET) -#if defined(B_OFFSET) - , - IMAGE_DECLARATION(sum_row) -#endif // defined(B_OFFSET) -#if defined(ADD_BIAS) - , - VECTOR_DECLARATION(biases) -#endif // defined(ADD_BIAS)) - ) -{ - const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); - const int y = get_global_id(1); - const int z = get_global_id(2); - - // Compute offset contribution - VEC_INT offset_term_s32 = offset_contribution( - x, y, z -#if defined(A_OFFSET) - , - sum_col_ptr, - sum_col_stride_x, - sum_col_step_x, - sum_col_stride_y, - sum_col_step_y, - sum_col_offset_first_element_in_bytes -#endif // defined(A_OFFSET) -#if defined(B_OFFSET) - , - sum_row_ptr, - sum_row_stride_x, - sum_row_step_x, - sum_row_stride_y, - sum_row_step_y, - sum_row_offset_first_element_in_bytes -#endif // defined(B_OFFSET) -#if defined(ADD_BIAS) - , - biases_ptr, - biases_stride_x, - biases_step_x, - biases_offset_first_element_in_bytes -#endif // defined(ADD_BIAS) - ); - - __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z; - - VEC_INT in_s32_0 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr); - - // Add the offset terms to GEMM's result - in_s32_0 += offset_term_s32; - - // Store the result with the offset contribution - STORE_VECTOR_SELECT(in_s32_, int, mm_result_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) -} - -#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT) && defined(OUTPUT_DATA_TYPE) -/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and it quantizes down to uint8. - * - * This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output stage. - * - * - * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200) - * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1) - * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6) - * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches - * - * The result before the output stage is: - * - * mm_result[i][k] = mm_result[i][k] + - * (sum_col[k] * A_OFFSET) + - * (sum_row[i] * B_OFFSET) + - * (K_OFFSET) - * - * This result is quantized down to uint8/int8 using the output stage. 
The output stage computes the following operations: - * - * -# Add offset terms to final result - * -# Multiply each entry of result by result_mult_int - * -# Add bias to final result (if -DADD_BIAS is passed at compile time) - * -# Shift the int32 accumulator by result_shift - * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time) - * -# Clamp the resulting int32 values: - * - to the [0..255] range and cast to QASYMM8. - * - to the [-128..127] range and cast to QASYMM8_SIGNED. - * - * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT - * - * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time - * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE - * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND. - * These values can be used to implement "rectified linear unit" activation functions - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE - * - * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32 - * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr - * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes) - * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes) - * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor - * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. 
Supported data type: same as @p mm_result_ptr - * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes) - * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes) - * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor - * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr - * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor - * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8/QASYMM8_SIGNED - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] result_multipliers_ptr (Optional) Pointer to the output multipliers vector for per-channel quantization. Supported data types: S32 - * @param[in] result_multipliers_stride_x (Optional) Stride of the output multipliers vector in X dimension (in bytes) - * @param[in] result_multipliers_step_x (Optional) output_multipliers_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first element in the output multipliers vector - * @param[in] result_shifts_ptr (Optional) Pointer to the output shifts vector for per-channel quantization. 
Supported data types: S32 - * @param[in] result_shifts_stride_x (Optional) Stride of the output shifts vector in X dimension (in bytes) - * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first element in the output shifts vector - */ -__kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm_result) -#if defined(A_OFFSET) - , - IMAGE_DECLARATION(sum_col) -#endif // defined(A_OFFSET) -#if defined(B_OFFSET) - , - IMAGE_DECLARATION(sum_row) -#endif // defined(B_OFFSET) - , -#if defined(ADD_BIAS) - VECTOR_DECLARATION(biases), -#endif // defined(ADD_BIAS) - TENSOR3D_DECLARATION(dst) -#if defined(PER_CHANNEL_QUANTIZATION) - , - VECTOR_DECLARATION(result_multipliers), - VECTOR_DECLARATION(result_shifts) -#endif // defined(PER_CHANNEL_QUANTIZATION) - ) -{ - const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); - const int y = get_global_id(1); - const int z = get_global_id(2); - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; - - // Compute offset contribution - VEC_INT offset_term_s32 = offset_contribution( - x, y, z -#if defined(A_OFFSET) - , - sum_col_ptr, - sum_col_stride_x, - sum_col_step_x, - sum_col_stride_y, - sum_col_step_y, - sum_col_offset_first_element_in_bytes -#endif // defined(A_OFFSET) -#if defined(B_OFFSET) - , - sum_row_ptr, - sum_row_stride_x, - sum_row_step_x, - sum_row_stride_y, - sum_row_step_y, - sum_row_offset_first_element_in_bytes -#endif // defined(B_OFFSET) -#if defined(ADD_BIAS) - , - biases_ptr, - biases_stride_x, - biases_step_x, - biases_offset_first_element_in_bytes -#endif // defined(ADD_BIAS) - ); - - __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z; - - VEC_INT in_s32 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr); - - // Add the offset terms to GEMM's result - in_s32 += offset_term_s32; - - // -------------- OUTPUT STAGE - - // Add the offset terms to GEMM's result - in_s32 += (VEC_INT)RESULT_OFFSET; - - // Multiply by result_mult_int and shift -#if defined(PER_CHANNEL_QUANTIZATION) - __global uchar *result_multipliers_addr = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int); - __global uchar *result_shifts_addr = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int); - VEC_INT result_multipliers_values = VLOAD(VEC_SIZE)(0, (__global int *)result_multipliers_addr); - VEC_INT result_shifts_values = VLOAD(VEC_SIZE)(0, (__global int *)result_shifts_addr); - - in_s32 *= result_multipliers_values; - in_s32 >>= result_shifts_values; -#else // defined(PER_CHANNEL_QUANTIZATION) - in_s32 *= RESULT_MULTIPLIER; - - in_s32 >>= RESULT_SHIFT; -#endif // defined(PER_CHANNEL_QUANTIZATION) - - VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE) - res0 = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)); - -#if defined(MIN_BOUND) - res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND); -#endif // defined(MIN_BOUND) -#if defined(MAX_BOUND) - res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND); -#endif // defined(MAX_BOUND) - - // Store the result - STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && 
get_global_id(0) == 0) -} - -/* OpenCL kernel used to add the offset contribution after matrix multiplication and it quantizes down to uint8. - * - * This kernel takes a final int32 accumulator value (the output of matrix multiplication), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output stage. - * - * - * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200) - * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1) - * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6) - * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches - * - * The result before the output stage is: - * - * mm_result[i][k] = mm_result[i][k] + - * (sum_col[k] * A_OFFSET) + - * (sum_row[i] * B_OFFSET) + - * (K_OFFSET) - * - * This result is quantized down to uint8/int8 using the output stage. The output stage computes the following operations: - * - * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier - * -# Add bias to final result if bias tensor is not a nullptr - * -# Round to nearest division by a power-of-two using result_shift - * -# Add offset to each result - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values: - * - to the [0..255] range and cast to QASYMM8. - * - to the [-128..127] range and cast to QASYMM8_SIGNED. - * - * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT - * - * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time - * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE - * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND. - * These values can be used to implement "rectified linear unit" activation functions - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE - * - * @param[in] mm_result_ptr Pointer to the source tensor. 
Supported data type: S32 - * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr - * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes) - * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes) - * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor - * @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr - * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes) - * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes) - * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor - * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr - * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor - * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8/QASYMM8_SIGNED - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] result_multipliers_ptr (Optional) Pointer to the output multipliers vector for per-channel quantization. 
Supported data types: S32 - * @param[in] result_multipliers_stride_x (Optional) Stride of the output multipliers vector in X dimension (in bytes) - * @param[in] result_multipliers_step_x (Optional) output_multipliers_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first element in the output multipliers vector - * @param[in] result_shifts_ptr (Optional) Pointer to the output shifts vector for per-channel quantization. Supported data types: S32 - * @param[in] result_shifts_stride_x (Optional) Stride of the output shifts vector in X dimension (in bytes) - * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first element in the output shifts vector - */ -__kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DECLARATION(mm_result) -#if defined(A_OFFSET) - , - IMAGE_DECLARATION(sum_col) -#endif // defined(A_OFFSET) -#if defined(B_OFFSET) - , - IMAGE_DECLARATION(sum_row) -#endif // defined(B_OFFSET) - , -#if defined(ADD_BIAS) - VECTOR_DECLARATION(biases), -#endif // defined(ADD_BIAS) - TENSOR3D_DECLARATION(dst) -#if defined(PER_CHANNEL_QUANTIZATION) - , - VECTOR_DECLARATION(result_multipliers), - VECTOR_DECLARATION(result_shifts) -#endif // defined(PER_CHANNEL_QUANTIZATION) - ) -{ - const int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); - const int y = get_global_id(1); - const int z = get_global_id(2); - - // Compute offset contribution - VEC_INT offset_term_s32 = offset_contribution( - x, y, z -#if defined(A_OFFSET) - , - sum_col_ptr, - sum_col_stride_x, - sum_col_step_x, - sum_col_stride_y, - sum_col_step_y, - sum_col_offset_first_element_in_bytes -#endif // defined(A_OFFSET) -#if defined(B_OFFSET) - , - sum_row_ptr, - sum_row_stride_x, - sum_row_step_x, - sum_row_stride_y, - sum_row_step_y, - sum_row_offset_first_element_in_bytes -#endif // defined(B_OFFSET) -#if defined(ADD_BIAS) - , - biases_ptr, - biases_stride_x, - biases_step_x, - biases_offset_first_element_in_bytes -#endif // defined(ADD_BIAS) - ); - - __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z; - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; - - VEC_INT in_s32 = VLOAD(VEC_SIZE)(0, (__global int *)mm_result_addr); - - // Add the offset terms to GEMM's result - in_s32 += offset_term_s32; - - // -------------- OUTPUT STAGE - - // Multiply by result_mult_int and shift -#if defined(PER_CHANNEL_QUANTIZATION) - __global uchar *result_multipliers_addr = result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int); - __global uchar *result_shifts_addr = result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int); - VEC_INT result_multipliers_values = VLOAD(VEC_SIZE)(0, (__global int *)result_multipliers_addr); - VEC_INT result_shifts_values = VLOAD(VEC_SIZE)(0, (__global int *)result_shifts_addr); - - VEC_INT in_s32_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, VEC_SIZE); - VEC_INT in_s32_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, result_multipliers_values, 
result_shifts_values, VEC_SIZE); - in_s32 = select(in_s32_shift_lt0, in_s32_shift_gt0, result_shifts_values >= 0); -#else // defined(PER_CHANNEL_QUANTIZATION) - -#if RESULT_SHIFT < 0 - in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE); -#else // RESULT_SHIFT >= 0 - in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE); -#endif // RESULT_SHIFT < 0 - -#endif // defined(PER_CHANNEL_QUANTIZATION) - - // Add the offset terms to GEMM's result - in_s32 += (VEC_INT)RESULT_OFFSET; - - VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE) - res0 = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)); - -#if defined(MIN_BOUND) - res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND); -#endif // defined(MIN_BOUND) -#if defined(MAX_BOUND) - res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND); -#endif // defined(MAX_BOUND) - - // Store the result - STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) -} -#endif // defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT) && defined(OUTPUT_DATA_TYPE) - -#undef VEC_INT - -#endif // defined(K_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) - -#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) -/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED - * - * This kernel takes a final int32 accumulator value and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. - * The following computations will be performed by the kernel: - * - * -# Add offset terms to final result - * -# Multiply each entry of result by result_mult_int - * -# Add bias to final result (if -DADD_BIAS is passed at compile time) - * -# Shift the int32 accumulator by result_shift - * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time) - * -# Clamp the resulting int32 values: - * -# - to the [0..255] range and cast to QASYMM8. - * -# - to the [-128..127] range and cast to QASYMM8_SIGNED. - * - * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT - * - * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time - * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE - * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND. - * These values can be used to implement "rectified linear unit" activation functions - * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data type: S32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr - * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor - * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8/QASYMM8_SIGNED - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src), -#if defined(ADD_BIAS) - VECTOR_DECLARATION(biases), -#endif // defined(ADD_BIAS) - TENSOR3D_DECLARATION(dst)) -{ - // Compute source and destination addresses - int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); - int y = get_global_id(1); - int z = get_global_id(2); - - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z; - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; - - VEC_DATA_TYPE(int, VEC_SIZE) - input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr); - -#if defined(ADD_BIAS) - // Add bias - __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); - - VEC_DATA_TYPE(int, VEC_SIZE) - biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr); - input_values += biases_values; -#endif // defined(ADD_BIAS) - - // Add the offset terms to GEMM's result - input_values += (VEC_DATA_TYPE(int, VEC_SIZE))RESULT_OFFSET; - - // Multiply by result_mult_int and shift - input_values *= RESULT_MULT_INT; - -#if RESULT_SHIFT < 0 - input_values >>= -RESULT_SHIFT; -#else // RESULT_SHIFT >= 0 - input_values >>= RESULT_SHIFT; -#endif // RESULT_SHIFT < 0 - - VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE) - res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)); - -#if defined(MIN_BOUND) - res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND); -#endif // defined(MIN_BOUND) -#if defined(MAX_BOUND) - res0 = 
min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND); -#endif // defined(MAX_BOUND) - - // Store the result - STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) -} -#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT) - -#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT) -/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED - * - * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. - * The following computations will be performed by the kernel: - * - * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier - * -# Add bias to final result if bias tensor is not a nullptr - * -# Round to nearest division by a power-of-two using result_shift - * -# Add offset to each result - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values: - * - to the [0..255] range and cast to QASYMM8. - * - to the [-128..127] range and cast to QASYMM8_SIGNED. - * - * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT - * - * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time - * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE - * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND. - * These values can be used to implement "rectified linear unit" activation functions - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE - * - * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] biases_ptr (Optional) Pointer to the biases tensor. 
Supported data type: same as @p src_ptr - * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor - * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8/QASYMM8_SIGNED - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src), -#if defined(ADD_BIAS) - VECTOR_DECLARATION(biases), -#endif // defined(ADD_BIAS) - TENSOR3D_DECLARATION(dst)) -{ - // Compute source and destination addresses - int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); - int y = get_global_id(1); - int z = get_global_id(2); - - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z; - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; - - VEC_DATA_TYPE(int, VEC_SIZE) - input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr); - -#if defined(ADD_BIAS) - // Add bias - __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); - - VEC_DATA_TYPE(int, VEC_SIZE) - biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr); - input_values += biases_values; -#endif // defined(ADD_BIAS) - - // Multiply by result_mult_int and shift -#if RESULT_SHIFT < 0 - input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE); -#else // RESULT_SHIFT >= 0 - input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE); -#endif // RESULT_SHIFT < 0 - - // Add the offset terms to GEMM's result - input_values += (VEC_DATA_TYPE(int, VEC_SIZE))RESULT_OFFSET_AFTER_SHIFT; - - VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE) - res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)); - -#if defined(MIN_BOUND) - res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND); -#endif // defined(MIN_BOUND) -#if defined(MAX_BOUND) - res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND); -#endif // defined(MAX_BOUND) - - // Store the result - STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) -} -#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT) - -#if defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT) - -/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16 - * - * This kernel 
takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QSYMM16 value. - * The following computations will be performed by the kernel: - * - * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier - * -# Add bias to final result if bias tensor is not a nullptr - * -# Round to nearest division by a power-of-two using result_shift - * -# Add offset to each result - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values to the [-32768..32767] range and cast to QSYMM16. - * - * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT - * - * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time - * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND. - * These values can be used to implement "rectified linear unit" activation functions - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE - * - * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] biases_ptr (Optional) Pointer to the biases tensor. 
Supported data type: same as @p src_ptr - * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor - * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QSYMM16 - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DECLARATION(src), -#if defined(ADD_BIAS) - VECTOR_DECLARATION(biases), -#endif // defined(ADD_BIAS) - TENSOR3D_DECLARATION(dst)) -{ - // Compute source and destination addresses - int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); - int y = get_global_id(1); - int z = get_global_id(2); - - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z; - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(short) + y * dst_stride_y + z * dst_stride_z; - - VEC_DATA_TYPE(int, VEC_SIZE) - input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr); - -#if defined(ADD_BIAS) - // Add bias - __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); - - VEC_DATA_TYPE(int, VEC_SIZE) - biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr); - input_values += biases_values; -#endif // defined(ADD_BIAS) - - // Multiply by result_mult_int and shift -#if RESULT_SHIFT < 0 - input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE); -#else // RESULT_SHIFT >= 0 - input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, VEC_SIZE); -#endif // RESULT_SHIFT < 0 - - VEC_DATA_TYPE(short, VEC_SIZE) - res0 = CONVERT_SAT(input_values, VEC_DATA_TYPE(short, VEC_SIZE)); - -#if defined(MIN_BOUND) - res0 = max(res0, (VEC_DATA_TYPE(short, VEC_SIZE))MIN_BOUND); -#endif // defined(MIN_BOUND) -#if defined(MAX_BOUND) - res0 = min(res0, (VEC_DATA_TYPE(short, VEC_SIZE))MAX_BOUND); -#endif // defined(MAX_BOUND) - - // Store the result - STORE_VECTOR_SELECT(res, short, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) -} -#endif // defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT) - -#if defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET) -/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED - * - * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. 
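- * Condensed to a single expression (an illustrative sketch for QASYMM8 output; the bias addition and optional clamping listed below are omitted), each element is requantized as
- *
- *   res = convert_uchar_sat(round((float)acc * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET));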
- * The following computations will be performed by the kernel: - * - * -# Multiply each entry of the input by the real multiplier REAL_MULTIPLIER - * -# Add bias to the final result if the bias tensor is not a nullptr - * -# Requantize - * -# Add offset to each result - * -# Clamp the value between the specified min and max bounds - * -# Clamp the resulting int32 values: - * - to the [0..255] range and cast to QASYMM8. - * - to the [-128..127] range and cast to QASYMM8_SIGNED. - * - * @attention The offset and scalar scale factor must be passed at compile time using -DOUTPUT_OFFSET and -DREAL_MULTIPLIER - * - * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time - * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE - * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND. - * These values can be used to implement "rectified linear unit" activation functions - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE - * - * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] biases_ptr (Optional) Pointer to the biases tensor.
Supported data type: same as @p src_ptr - * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes) - * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: QASYMM8/QASYMM8_SIGNED - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src), -#if defined(ADD_BIAS) - VECTOR_DECLARATION(biases), -#endif // defined(ADD_BIAS) -#if defined(DST_HEIGHT) - TENSOR4D_DECLARATION(dst)) -#else // defined(DST_HEIGHT) - TENSOR3D_DECLARATION(dst)) -#endif // defined(DST_HEIGHT) -{ - // Compute source and destination addresses - int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); - int y = get_global_id(1); - int z = get_global_id(2); - - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z; - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z; - - VEC_DATA_TYPE(int, VEC_SIZE) - input_values = VLOAD(VEC_SIZE)(0, (__global int *)src_addr); - -#if defined(ADD_BIAS) - // Add bias - __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int); - - VEC_DATA_TYPE(int, VEC_SIZE) - biases_values = VLOAD(VEC_SIZE)(0, (__global int *)bias_addr); - input_values += (VEC_DATA_TYPE(int, VEC_SIZE))biases_values; -#endif // defined(ADD_BIAS) - - // Convert to float - VEC_DATA_TYPE(float, VEC_SIZE) - input_values_f = CONVERT(input_values, VEC_DATA_TYPE(float, VEC_SIZE)); - input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET); - - VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE) - res0 = CONVERT_SAT(input_values_f, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE)); - -#if defined(MIN_BOUND) - res0 = max(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MIN_BOUND); -#endif // defined(MIN_BOUND) -#if defined(MAX_BOUND) - res0 = min(res0, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, VEC_SIZE))MAX_BOUND); -#endif // defined(MAX_BOUND) - - // Store the result - STORE_VECTOR_SELECT(res, OUTPUT_DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) -} -#endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
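For reference, the rescaling performed by the two output-stage kernels above can be modelled in scalar C. This is a minimal sketch of the gemmlowp-style fixed-point semantics the ASYMM_MULT_BY_QUANT_MULTIPLIER_* helpers are assumed to follow (RESULT_SHIFT >= 0 path) together with the REAL_MULTIPLIER/OUTPUT_OFFSET float path; the function names are illustrative, not the library's:

    #include <math.h>
    #include <stdint.h>

    /* Saturating rounding doubling high multiply: high 32 bits of 2*a*b,
     * rounded, saturating only for a == b == INT32_MIN (gemmlowp semantics). */
    static int32_t sat_rounding_doubling_high_mul(int32_t a, int32_t b)
    {
        if (a == INT32_MIN && b == INT32_MIN)
            return INT32_MAX; /* the single overflowing case */
        int64_t ab    = (int64_t)a * (int64_t)b;
        int64_t nudge = (ab >= 0) ? (1ll << 30) : (1 - (1ll << 30));
        return (int32_t)((ab + nudge) / (1ll << 31));
    }

    /* Rounding divide by a power of two (round to nearest). */
    static int32_t rounding_divide_by_pot(int32_t x, int exponent)
    {
        const int32_t mask      = (int32_t)((1u << exponent) - 1);
        const int32_t remainder = x & mask;
        const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
        return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
    }

    /* Scalar model of gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16. */
    static int16_t quantize_down_qsymm16(int32_t acc, int32_t multiplier, int shift)
    {
        int32_t v = sat_rounding_doubling_high_mul(acc, multiplier);
        v = rounding_divide_by_pot(v, shift);
        if (v < -32768) v = -32768; /* CONVERT_SAT to short */
        if (v > 32767)  v = 32767;
        return (int16_t)v;
    }

    /* Scalar model of gemmlowp_output_stage_quantize_down_float for QASYMM8. */
    static uint8_t quantize_down_float_qasymm8(int32_t acc, float real_multiplier, float output_offset)
    {
        float v = roundf((float)acc * real_multiplier + output_offset);
        if (v < 0.0f)   v = 0.0f;   /* saturate to the [0..255] range */
        if (v > 255.0f) v = 255.0f;
        return (uint8_t)v;
    }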
diff --git a/src/core/CL/cl_kernels/gemv.cl b/src/core/CL/cl_kernels/gemv.cl deleted file mode 100644 index aaa83975f8..0000000000 --- a/src/core/CL/cl_kernels/gemv.cl +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) -/** This kernel applies dot product to each plane on the input tensor and the corresponding column of the reshaped weight tensor. - * - * @note Datatype and source width and height should be given as preprocessor arguments using -DDATA_TYPE=type, -DSRC_WIDTH=width and -DSRC_HEIGHT=height. e.g. -DDATA_TYPE=short - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[out] dst_ptr Pointer to the destination tensor.
Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void gemm_mv(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(weights), VECTOR_DECLARATION(dst)) -{ - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); - - int y = get_global_id(1) * 4; - int z = get_global_id(2); - - __global uchar *current_weights = weights_ptr + weights_offset_first_element_in_bytes + z * weights_stride_y; - __global uchar *input_ptr = src.ptr; - - DATA_TYPE acc0 = (DATA_TYPE)0; - DATA_TYPE acc1 = (DATA_TYPE)0; - DATA_TYPE acc2 = (DATA_TYPE)0; - DATA_TYPE acc3 = (DATA_TYPE)0; - - // This kernel handles 4 rows per thread so that it can reuse the weights - for(int i = 0; i < SRC_WIDTH; i += 4) - { - VEC_DATA_TYPE(DATA_TYPE, 4) - weights = vload4(0, (__global DATA_TYPE *)(current_weights + i * weights_stride_x)); - - int4 offset = (int4)i * (int4)src_stride_x + (int4)(0, 1, 2, 3) * (int4)src_stride_y; - - VEC_DATA_TYPE(DATA_TYPE, 4) - tmp0 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s0)); - VEC_DATA_TYPE(DATA_TYPE, 4) - tmp1 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s1)); - VEC_DATA_TYPE(DATA_TYPE, 4) - tmp2 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s2)); - VEC_DATA_TYPE(DATA_TYPE, 4) - tmp3 = vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s3)); - - acc0 += dot(weights, tmp0); - acc1 += dot(weights, tmp1); - acc2 += dot(weights, tmp2); - acc3 += dot(weights, tmp3); - } - - __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y + z * SRC_HEIGHT) * dst_stride_x; - - int rows_left = SRC_HEIGHT - (y + 4); - - // This check handles the last few rows when the number of rows is not divisible by four - if(rows_left >= 0) - { - VEC_DATA_TYPE(DATA_TYPE, 4) - out = (VEC_DATA_TYPE(DATA_TYPE, 4))(acc0, acc1, acc2, acc3); - vstore4(out, 0, (__global DATA_TYPE *)output_ptr); - } - else - { - switch(rows_left) - { - case -1: // three rows left; one is padding - *((__global DATA_TYPE *)(output_ptr + 2 * dst_stride_x)) = acc2; - case -2: // two rows left; two are padding - *((__global DATA_TYPE *)(output_ptr + 1 * dst_stride_x)) = acc1; - case -3: // one row left; three are padding - *((__global DATA_TYPE *)(output_ptr + 0 * dst_stride_x)) = acc0; - break; - } - } -} - -/** This kernel applies dot product to each plane on the input tensor and the corresponding column of the reshaped weight tensor. - * - * @note Input data type should be given as a preprocessor argument using -DDATA_TYPE=type, e.g. -DDATA_TYPE=uchar - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] weights_ptr Pointer to the weights tensor.
Supported data types: same as @p src_ptr - * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: S32 - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] input_offset Input's quantization offset - * @param[in] weights_offset Weights' quantization offset - */ -__kernel void gemm_mv_quantized(TENSOR3D_DECLARATION(src), - IMAGE_DECLARATION(weights), - VECTOR_DECLARATION(dst), - const int input_offset, - const int weights_offset) -{ - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); - - int y = get_global_id(1) * 4; - int z = get_global_id(2); - - __global uchar *current_weights = weights_ptr + weights_offset_first_element_in_bytes + z * weights_stride_y; - __global uchar *input_ptr = src.ptr; - - int acc0 = 0; - int acc1 = 0; - int acc2 = 0; - int acc3 = 0; - - // This kernel handles 4 rows per thread so that it can reuse the weights - for(int i = 0; i < SRC_WIDTH; i += 4) - { - int4 w = convert_int4(vload4(0, (__global DATA_TYPE *)(current_weights + i * weights_stride_x))) + (int4)weights_offset; - - int4 offset = (int4)i * (int4)src_stride_x + (int4)(0, 1, 2, 3) * (int4)src_stride_y; - - int4 tmp0 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s0))) + (int4)input_offset; - int4 tmp1 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s1))) + (int4)input_offset; - int4 tmp2 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s2))) + (int4)input_offset; - int4 tmp3 = convert_int4(vload4(0, (__global DATA_TYPE *)(input_ptr + offset.s3))) + (int4)input_offset; - - // Accumulate - acc0 += tmp0.s0 * w.s0 + tmp0.s1 * w.s1 + tmp0.s2 * w.s2 + tmp0.s3 * w.s3; - acc1 += tmp1.s0 * w.s0 + tmp1.s1 * w.s1 + tmp1.s2 * w.s2 + tmp1.s3 * w.s3; - acc2 += tmp2.s0 * w.s0 + tmp2.s1 * w.s1 + tmp2.s2 * w.s2 + tmp2.s3 * w.s3; - acc3 += tmp3.s0 * w.s0 + tmp3.s1 * w.s1 + tmp3.s2 * w.s2 + tmp3.s3 * w.s3; - } - - __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y + z * SRC_HEIGHT) * dst_stride_x; - - int rows_left = SRC_HEIGHT - (y + 4); - - // This check handles the last few rows when the number of rows is not divisible by four - if(rows_left >= 0) - { - vstore4((int4)(acc0, acc1, acc2, acc3), 0, (__global int *)output_ptr); - } - else - { - switch(rows_left) - { - case -1: // three rows left; one is padding - *((__global int *)(output_ptr + 2 * dst_stride_x)) = acc2; - case -2: // two rows left; two are padding - *((__global int *)(output_ptr + 1 * dst_stride_x)) = acc1; - case -3: // one row left; three are padding - *((__global int *)(output_ptr + 0 * dst_stride_x)) = acc0; - break; - } - } -} -#endif /* defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) */
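The quantized variant above folds the quantization offsets into the accumulation: every loaded input and weight value is shifted by its zero point before the multiply-accumulate, and 1-3 leftover rows are stored through an intentional switch fall-through. A minimal scalar C sketch of both ideas (hypothetical helper names; the real kernel vectorizes over 4 columns and 4 rows):

    #include <stdint.h>

    /* Scalar model of gemm_mv_quantized's per-row accumulation: the zero
     * points (input_offset, weights_offset) are added back before the MAC,
     * so the int32 result matches a real-valued dot product up to scaling. */
    static int32_t dot_row_quantized(const uint8_t *row, const uint8_t *weights,
                                     int width, int input_offset, int weights_offset)
    {
        int32_t acc = 0;
        for (int i = 0; i < width; ++i)
        {
            const int32_t x = (int32_t)row[i] + input_offset;
            const int32_t w = (int32_t)weights[i] + weights_offset;
            acc += x * w;
        }
        return acc;
    }

    /* Tail handling as in the kernel: with rows_left = SRC_HEIGHT - (y + 4),
     * a negative value means 1-3 valid rows remain; the fall-through stores
     * exactly the valid accumulators (acc2, then acc1, then acc0). */
    static void store_tail(int32_t *out, int rows_left,
                           int32_t acc0, int32_t acc1, int32_t acc2)
    {
        switch (rows_left)
        {
            case -1: out[2] = acc2; /* fall through */
            case -2: out[1] = acc1; /* fall through */
            case -3: out[0] = acc0; break;
            default: break;
        }
    }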
diff --git a/src/core/CL/cl_kernels/generate_proposals.cl b/src/core/CL/cl_kernels/generate_proposals.cl deleted file mode 100644 index e8306c55a8..0000000000 --- a/src/core/CL/cl_kernels/generate_proposals.cl +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** Generate all the regions of interest based on the image size and the anchors passed in. For each element (x,y) of the - * grid, it will generate NUM_ANCHORS rois, given by shifting the grid position to match the anchor. - * - * @attention The following variables must be passed at compile time: - * -# -DDATA_TYPE= Tensor data type. Supported data types: F16/F32 - * -# -DHEIGHT= Height of the feature map on which this kernel is applied - * -# -DWIDTH= Width of the feature map on which this kernel is applied - * -# -DNUM_ANCHORS= Number of anchors to be used to generate the rois per each pixel - * -# -DSTRIDE= Stride to be applied at each different pixel position (i.e., x_range = (1:WIDTH)*STRIDE and y_range = (1:HEIGHT)*STRIDE) - * -# -DNUM_ROI_FIELDS= Number of fields used to represent a roi - * - * @param[in] anchors_ptr Pointer to the anchors tensor. Supported data types: F16/F32 - * @param[in] anchors_stride_x Stride of the anchors tensor in X dimension (in bytes) - * @param[in] anchors_step_x anchors_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] anchors_stride_y Stride of the anchors tensor in Y dimension (in bytes) - * @param[in] anchors_step_y anchors_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] anchors_stride_z Stride of the anchors tensor in Z dimension (in bytes) - * @param[in] anchors_step_z anchors_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] anchors_offset_first_element_in_bytes The offset of the first element in the anchors tensor - * @param[out] rois_ptr Pointer to the rois.
Supported data types: same as @p anchors_ptr - * @param[out] rois_stride_x Stride of the rois in X dimension (in bytes) - * @param[out] rois_step_x rois_stride_x * number of elements along X processed per workitem(in bytes) - * @param[out] rois_stride_y Stride of the rois in Y dimension (in bytes) - * @param[out] rois_step_y rois_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[out] rois_stride_z Stride of the rois in Z dimension (in bytes) - * @param[out] rois_step_z rois_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[out] rois_offset_first_element_in_bytes The offset of the first element in the rois - */ -#if defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(NUM_ANCHORS) && defined(STRIDE) && defined(NUM_ROI_FIELDS) -__kernel void generate_proposals_compute_all_anchors( - VECTOR_DECLARATION(anchors), - VECTOR_DECLARATION(rois)) -{ - Vector anchors = CONVERT_TO_VECTOR_STRUCT_NO_STEP(anchors); - Vector rois = CONVERT_TO_VECTOR_STRUCT(rois); - - const size_t idx = get_global_id(0); - // Find the index of the anchor - const size_t anchor_idx = idx % NUM_ANCHORS; - - // Find which shift this thread is using - const size_t shift_idx = idx / NUM_ANCHORS; - - // Compute the shift in the X and Y directions (the shift depends exclusively on the thread id) - const DATA_TYPE - shift_x = (DATA_TYPE)(shift_idx % WIDTH) * STRIDE; - const DATA_TYPE - shift_y = (DATA_TYPE)(shift_idx / WIDTH) * STRIDE; - - const VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS) - shift = (VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS))(shift_x, shift_y, shift_x, shift_y); - - // Read the given anchor - const VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS) - anchor = vload4(0, (__global DATA_TYPE *)vector_offset(&anchors, anchor_idx * NUM_ROI_FIELDS)); - - // Apply the shift to the anchor - const VEC_DATA_TYPE(DATA_TYPE, NUM_ROI_FIELDS) - shifted_anchor = anchor + shift; - - vstore4(shifted_anchor, 0, (__global DATA_TYPE *)rois.ptr); -} -#endif //defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(NUM_ANCHORS) && defined(STRIDE) && defined(NUM_ROI_FIELDS)
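The anchor-shifting math above is simple enough to state in scalar C: each work item selects an anchor and a grid cell, and the roi is the anchor translated by that cell's (x, y) offset in input-image coordinates. A minimal sketch (hypothetical helper name, float data, NUM_ROI_FIELDS == 4):

    /* Scalar model of generate_proposals_compute_all_anchors: one roi per
     * (grid cell, anchor) pair, each roi stored as (x1, y1, x2, y2). */
    static void compute_all_anchors(const float *anchors, /* num_anchors x 4 */
                                    int width, int height, int num_anchors,
                                    float stride, float *rois /* output */)
    {
        for (int idx = 0; idx < width * height * num_anchors; ++idx)
        {
            const int   anchor_idx = idx % num_anchors;  /* which anchor    */
            const int   shift_idx  = idx / num_anchors;  /* which grid cell */
            const float shift_x    = (float)(shift_idx % width) * stride;
            const float shift_y    = (float)(shift_idx / width) * stride;

            /* Both corners are translated by the same (shift_x, shift_y). */
            rois[4 * idx + 0] = anchors[4 * anchor_idx + 0] + shift_x;
            rois[4 * idx + 1] = anchors[4 * anchor_idx + 1] + shift_y;
            rois[4 * idx + 2] = anchors[4 * anchor_idx + 2] + shift_x;
            rois[4 * idx + 3] = anchors[4 * anchor_idx + 3] + shift_y;
        }
    }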
diff --git a/src/core/CL/cl_kernels/generate_proposals_quantized.cl b/src/core/CL/cl_kernels/generate_proposals_quantized.cl deleted file mode 100644 index 04264197f4..0000000000 --- a/src/core/CL/cl_kernels/generate_proposals_quantized.cl +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) 2019 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers_asymm.h" - -/** Generate all the regions of interest based on the image size and the anchors passed in. For each element (x,y) of the - * grid, it will generate NUM_ANCHORS rois, given by shifting the grid position to match the anchor. - * - * @attention The following variables must be passed at compile time: - * -# -DDATA_TYPE= Tensor data type. Supported data types: QASYMM8 - * -# -DHEIGHT= Height of the feature map on which this kernel is applied - * -# -DWIDTH= Width of the feature map on which this kernel is applied - * -# -DNUM_ANCHORS= Number of anchors to be used to generate the rois per each pixel - * -# -DSTRIDE= Stride to be applied at each different pixel position (i.e., x_range = (1:WIDTH)*STRIDE and y_range = (1:HEIGHT)*STRIDE) - * -# -DNUM_ROI_FIELDS= Number of fields used to represent a roi - * - * @param[in] anchors_ptr Pointer to the anchors tensor. Supported data types: QASYMM8 - * @param[in] anchors_stride_x Stride of the anchors tensor in X dimension (in bytes) - * @param[in] anchors_step_x anchors_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] anchors_stride_y Stride of the anchors tensor in Y dimension (in bytes) - * @param[in] anchors_step_y anchors_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] anchors_stride_z Stride of the anchors tensor in Z dimension (in bytes) - * @param[in] anchors_step_z anchors_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] anchors_offset_first_element_in_bytes The offset of the first element in the anchors tensor - * @param[out] rois_ptr Pointer to the rois.
Supported data types: same as @p anchors_ptr - * @param[out] rois_stride_x Stride of the rois in X dimension (in bytes) - * @param[out] rois_step_x rois_stride_x * number of elements along X processed per workitem(in bytes) - * @param[out] rois_stride_y Stride of the rois in Y dimension (in bytes) - * @param[out] rois_step_y rois_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[out] rois_stride_z Stride of the rois in Z dimension (in bytes) - * @param[out] rois_step_z rois_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[out] rois_offset_first_element_in_bytes The offset of the first element in the rois - */ -#if defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(NUM_ANCHORS) && defined(STRIDE) && defined(NUM_ROI_FIELDS) && defined(OFFSET) && defined(SCALE) -__kernel void generate_proposals_compute_all_anchors_quantized( - VECTOR_DECLARATION(anchors), - VECTOR_DECLARATION(rois)) -{ - Vector anchors = CONVERT_TO_VECTOR_STRUCT_NO_STEP(anchors); - Vector rois = CONVERT_TO_VECTOR_STRUCT(rois); - - const size_t idx = get_global_id(0); - // Find the index of the anchor - const size_t anchor_idx = idx % NUM_ANCHORS; - - // Find which shift this thread is using - const size_t shift_idx = idx / NUM_ANCHORS; - - // Compute the shift in the X and Y directions (the shift depends exclusively on the thread id) - const float shift_x = (float)(shift_idx % WIDTH) * STRIDE; - const float shift_y = (float)(shift_idx / WIDTH) * STRIDE; - - VEC_DATA_TYPE(float, NUM_ROI_FIELDS) - shift = (VEC_DATA_TYPE(float, NUM_ROI_FIELDS))(shift_x, shift_y, shift_x, shift_y); - - // Read the given anchor - VEC_DATA_TYPE(float, NUM_ROI_FIELDS) - anchor = DEQUANTIZE(VLOAD(NUM_ROI_FIELDS)(0, (__global DATA_TYPE *)vector_offset(&anchors, anchor_idx * NUM_ROI_FIELDS)), OFFSET, SCALE, DATA_TYPE, NUM_ROI_FIELDS); - - // Apply the shift to the anchor - VEC_DATA_TYPE(float, NUM_ROI_FIELDS) - shifted_anchor = anchor + shift; - - VSTORE(NUM_ROI_FIELDS) - (QUANTIZE(shifted_anchor, OFFSET, SCALE, DATA_TYPE, NUM_ROI_FIELDS), 0, (__global DATA_TYPE *)rois.ptr); -} -#endif //defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(NUM_ANCHORS) && defined(STRIDE) && defined(NUM_ROI_FIELDS) && defined(OFFSET) && defined(SCALE)
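The quantized variant cannot shift the anchors directly in the QASYMM8 domain, so it dequantizes each anchor, applies the shift in float, and requantizes the result. A minimal scalar sketch of that round trip, assuming the usual asymmetric mapping real = scale * (q - offset) for the DEQUANTIZE/QUANTIZE helpers (illustrative function names):

    #include <math.h>
    #include <stdint.h>

    /* real = scale * (q - offset) */
    static float dequantize_qasymm8(uint8_t q, int offset, float scale)
    {
        return scale * (float)((int)q - offset);
    }

    /* q = clamp(round(real / scale) + offset, 0, 255) */
    static uint8_t quantize_qasymm8(float real, int offset, float scale)
    {
        int q = (int)lroundf(real / scale) + offset;
        if (q < 0)   q = 0;
        if (q > 255) q = 255;
        return (uint8_t)q;
    }

    /* One roi field: dequantize, translate by the grid shift, requantize. */
    static uint8_t shift_anchor_field(uint8_t anchor_q, float shift,
                                      int offset, float scale)
    {
        return quantize_qasymm8(dequantize_qasymm8(anchor_q, offset, scale) + shift,
                                offset, scale);
    }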
diff --git a/src/core/CL/cl_kernels/im2col.cl b/src/core/CL/cl_kernels/im2col.cl deleted file mode 100644 index a1467a0b36..0000000000 --- a/src/core/CL/cl_kernels/im2col.cl +++ /dev/null @@ -1,1360 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(ELEMENT_SIZE) - -#if ELEMENT_SIZE == 1 -#define COND_DATA_TYPE char -#elif ELEMENT_SIZE == 2 -#define COND_DATA_TYPE short -#elif ELEMENT_SIZE == 4 -#define COND_DATA_TYPE int -#else // ELEMENT_SIZE -#error "Element size not supported" -#endif // ELEMENT_SIZE - -#if defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH) -/** This opencl kernel performs im2col when the kernel size is 1x1, the stride_x = 1 and the data layout is NCHW - * - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 - * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 - * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1 - * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. - * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4 - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). - */
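All of the NCHW im2col kernels in this file share the same index mapping: input window origin (xi, yi) = (xc * STRIDE_X - PAD_LEFT, yc * STRIDE_Y - PAD_TOP), output column base xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT and output row yo = xc + yc * CONVOLVED_WIDTH; the 1x1/3x3/5x5/11x11 variants below only specialize the kernel size and unroll the copies. A minimal scalar C sketch of the generic mapping (illustrative function name; float data, no grouping, no dilation, no HAS_BIAS column):

    #include <stddef.h>

    /* Scalar model of im2col (NCHW, no groups): copies each convolution
     * window into one row of the output matrix, padding with zero. */
    static void im2col_nchw(const float *src, int src_w, int src_h, int src_c,
                            int k_w, int k_h, int stride_x, int stride_y,
                            int pad_left, int pad_top, int convolved_w,
                            int convolved_h, float *dst)
    {
        for (int ch = 0; ch < src_c; ++ch)
            for (int yc = 0; yc < convolved_h; ++yc)
                for (int xc = 0; xc < convolved_w; ++xc)
                {
                    const int xi = xc * stride_x - pad_left; /* window origin     */
                    const int yi = yc * stride_y - pad_top;
                    const int xo = ch * k_w * k_h;           /* output column base */
                    const int yo = xc + yc * convolved_w;    /* output row         */
                    float *out = dst + (size_t)yo * (src_c * k_w * k_h) + xo;

                    for (int yk = 0; yk < k_h; ++yk)
                        for (int xk = 0; xk < k_w; ++xk)
                        {
                            const int x = xi + xk;
                            const int y = yi + yk;
                            const int in_bounds =
                                (x >= 0 && x < src_w && y >= 0 && y < src_h);
                            *out++ = in_bounds
                                     ? src[((size_t)ch * src_h + y) * src_w + x]
                                     : 0.0f; /* PAD_VALUE */
                        }
                }
    }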
-__kernel void im2col1x1_stridex1_nchw( - TENSOR3D_DECLARATION(src), -#if defined(NUM_GROUPS) - TENSOR3D_DECLARATION(dst), -#else // defined(NUM_GROUPS) - IMAGE_DECLARATION(dst), -#endif // defined(NUM_GROUPS) - uint src_stride_w, - uint dst_stride_w) -{ - const uint xc = get_global_id(0) * 4; // x coordinate in the convolved tensor - const uint yc = get_global_id(1); // y coordinate in the convolved tensor - const uint ch = get_global_id(2) % SRC_DEPTH; // input feature map - const uint batch = get_global_id(2) / SRC_DEPTH; // batch size - - // Clamp xc - // The strategy clamps at "xc", which is guaranteed to be a valid value - uint4 xc_clamped = xc + (uint4)(0, 1, 2, 3); - - // Check which values are valid - const VEC_DATA_TYPE(COND_DATA_TYPE, 4) cond0 = CONVERT((xc_clamped < SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4)); - - xc_clamped = select((uint4)xc, xc_clamped, convert_int4(cond0)); - - // Calculate input indices - const uint xi = xc; - const uint yi = yc * STRIDE_Y; - - // Calculate output indices - -#if defined(NUM_GROUPS) - const uint xo = ch % (SRC_DEPTH / NUM_GROUPS); - const uint zo = ch / (SRC_DEPTH / NUM_GROUPS); -#else // defined(NUM_GROUPS) - const uint xo = ch; -#endif // defined(NUM_GROUPS) - const uint4 yo = xc_clamped + yc * CONVOLVED_WIDTH; // Index of the convolution - - // Get input and output address - __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w; -#if defined(NUM_GROUPS) - __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + zo * dst_stride_z + batch * dst_stride_w; -#else // defined(NUM_GROUPS) - __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + batch * dst_stride_w; -#endif // defined(NUM_GROUPS) - - VEC_DATA_TYPE(DATA_TYPE, 4) - data = vload4(0, (__global DATA_TYPE *)input_ptr); - - // If out-of-bound, overwrite with the first element - data = select((VEC_DATA_TYPE(DATA_TYPE, 4))data.s0, data, cond0); - - *(__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) = data.s0; - *(__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) = data.s1; - *(__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) = data.s2; - *(__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) = data.s3; - -#ifdef HAS_BIAS -#if defined(NUM_GROUPS) - if(xo == (SRC_DEPTH / NUM_GROUPS - 1)) -#else // defined(NUM_GROUPS) - if(ch == (SRC_DEPTH - 1)) -#endif // defined(NUM_GROUPS) - { - *((__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) + 1) = 1.0f; - *((__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) + 1) = 1.0f; - *((__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) + 1) = 1.0f; - *((__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) + 1) = 1.0f; - } -#endif // HAS_BIAS -} -#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH) - -#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) -#if defined(DILATION_X) && defined(DILATION_Y) -/** This opencl kernel performs a generic im2col implementation when the data layout is NCHW - * - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g.
-DSRC_WIDTH=128 and -DSRC_HEIGHT=128 - * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 - * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64 - * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2 - * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0 - * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 - * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1 - * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. - * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4 - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
- */ -__kernel void im2col_generic_nchw( - TENSOR3D_DECLARATION(src), -#if defined(NUM_GROUPS) - TENSOR3D_DECLARATION(dst), -#else // defined(NUM_GROUPS) - IMAGE_DECLARATION(dst), -#endif // defined(NUM_GROUPS) - uint src_stride_w, - uint dst_stride_w) -{ - const int xc = get_global_id(0); // x coordinate in the convolved tensor - const int yc = get_global_id(1); // y coordinate in the convolved tensor - const int ch = get_global_id(2) % SRC_DEPTH; // input feature map - const int batch = get_global_id(2) / SRC_DEPTH; // batch size - - // Calculate input indices - const int xi = xc * STRIDE_X - PAD_LEFT; - const int yi = yc * STRIDE_Y - PAD_TOP; - - // Calculate output indices -#if defined(NUM_GROUPS) - const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * KERNEL_WIDTH * KERNEL_HEIGHT; - const int zo = ch / (SRC_DEPTH / NUM_GROUPS); -#else // defined(NUM_GROUPS) - const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT; -#endif // defined(NUM_GROUPS) - const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution - - __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w; -#if defined(NUM_GROUPS) - __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w)) + xo; -#else // defined(NUM_GROUPS) - __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo; -#endif // defined(NUM_GROUPS) - - // Linearize convolution elements - for(int yk = 0; yk < KERNEL_HEIGHT; ++yk) - { - int y = yi + yk * DILATION_Y; - for(int xk = 0; xk < KERNEL_WIDTH; ++xk, ++output_ptr) - { - int x = xi + xk * DILATION_X; -#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 - *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y)); -#else // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 - if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT) - { - *output_ptr = PAD_VALUE; - } - else - { - *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y)); - } -#endif // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 - } - } - -#ifdef HAS_BIAS -#if defined(NUM_GROUPS) - if((xo / (KERNEL_WIDTH * KERNEL_HEIGHT)) == (SRC_DEPTH / NUM_GROUPS - 1)) -#else // defined(NUM_GROUPS) - if(ch == (SRC_DEPTH - 1)) -#endif // defined(NUM_GROUPS) - { - *output_ptr = 1.0f; - } -#endif // HAS_BIAS -} -#endif // defined(DILATION_X) && defined(DILATION_Y) - -/** This opencl kernel performs im2col when the kernel size is 3x3 and the data layout is NCHW - * - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128 - * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 - * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 - * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. 
-DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2 - * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0 - * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 - * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
- */ -__kernel void im2col3x3_nchw( - TENSOR3D_DECLARATION(src), -#if defined(NUM_GROUPS) - TENSOR3D_DECLARATION(dst), -#else // defined(NUM_GROUPS) - IMAGE_DECLARATION(dst), -#endif // defined(NUM_GROUPS) - uint src_stride_w, - uint dst_stride_w) -{ - const int xc = get_global_id(0); // x coordinate in the convolved tensor - const int yc = get_global_id(1); // y coordinate in the convolved tensor - const int ch = get_global_id(2) % SRC_DEPTH; // input feature map - const int batch = get_global_id(2) / SRC_DEPTH; // batch size - - // Calculate input indices - const int xi = xc * STRIDE_X - PAD_LEFT; - const int yi = yc * STRIDE_Y - PAD_TOP; - - // Calculate output indices -#if defined(NUM_GROUPS) - const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 9; // 3x3 - const int zo = ch / (SRC_DEPTH / NUM_GROUPS); -#else // defined(NUM_GROUPS) - const int xo = ch * 9; // 3x3 -#endif // defined(NUM_GROUPS) - const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution - - // Get input and output address - __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w; -#if defined(NUM_GROUPS) - __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w; -#else // defined(NUM_GROUPS) - __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w; -#endif // defined(NUM_GROUPS) - - VEC_DATA_TYPE(DATA_TYPE, 3) - row0 = vload3(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y)); - VEC_DATA_TYPE(DATA_TYPE, 3) - row1 = vload3(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y)); - VEC_DATA_TYPE(DATA_TYPE, 3) - row2 = vload3(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y)); - -#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 - // Put 0 if the value is out-of-bound - int3 x = (int3)xi + (int3)(0, 1, 2); - int3 y = (int3)yi + (int3)(0, 1, 2); - - VEC_DATA_TYPE(COND_DATA_TYPE, 3) - cond0 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s0 >= 0 && y.s0 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3)); - VEC_DATA_TYPE(COND_DATA_TYPE, 3) - cond1 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s1 >= 0 && y.s1 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3)); - VEC_DATA_TYPE(COND_DATA_TYPE, 3) - cond2 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s2 >= 0 && y.s2 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3)); - - row0 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row0, cond0); - row1 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row1, cond1); - row2 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row2, cond2); -#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 - - vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row0.s012, row1.s012, row2.s01), 0, (__global DATA_TYPE *)output_ptr); - *((__global DATA_TYPE *)output_ptr + 8) = row2.s2; - -#ifdef HAS_BIAS -#if defined(NUM_GROUPS) - if((xo / 9) == (SRC_DEPTH / NUM_GROUPS - 1)) -#else // defined(NUM_GROUPS) - if(ch == (SRC_DEPTH - 1)) -#endif // defined(NUM_GROUPS) - { - *((__global DATA_TYPE *)output_ptr + 9) = 1.0f; - } -#endif // HAS_BIAS -} - -/** This opencl kernel performs im2col when the kernel size is 5x5 and the data layout is NCHW - * - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. 
-DDATA_TYPE=float - * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128 - * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 - * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 - * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2 - * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0 - * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 - * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. - * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4 - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
- */ -__kernel void im2col5x5_nchw( - TENSOR3D_DECLARATION(src), -#if defined(NUM_GROUPS) - TENSOR3D_DECLARATION(dst), -#else // defined(NUM_GROUPS) - IMAGE_DECLARATION(dst), -#endif // defined(NUM_GROUPS) - uint src_stride_w, - uint dst_stride_w) -{ - const int xc = get_global_id(0); // x coordinate in the convolved tensor - const int yc = get_global_id(1); // y coordinate in the convolved tensor - const int ch = get_global_id(2) % SRC_DEPTH; // input feature map - const int batch = get_global_id(2) / SRC_DEPTH; // batch size - - // Calculate input indices - const int xi = xc * STRIDE_X - PAD_LEFT; - const int yi = yc * STRIDE_Y - PAD_TOP; - - // Calculate output indices -#if defined(NUM_GROUPS) - const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 25; // 5x5 - const int zo = ch / (SRC_DEPTH / NUM_GROUPS); -#else // defined(NUM_GROUPS) - const int xo = ch * 25; // 5x5 -#endif // defined(NUM_GROUPS) - const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution - -#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 - // Put 0 if the value is out-of-bound - int4 x0 = (int4)xi + (int4)(0, 1, 2, 3); - int4 y0 = (int4)yi + (int4)(0, 1, 2, 3); - int x1 = xi + 4; - int y1 = yi + 4; - - // Check if we could have out-of-bounds elements in the x direction - VEC_DATA_TYPE(COND_DATA_TYPE, 4) - x0_condition = CONVERT((x0 >= (int4)0 && x0 < (int4)SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4)); - VEC_DATA_TYPE(COND_DATA_TYPE, 4) - y0_condition = CONVERT((y0 >= (int4)0 && y0 < (int4)SRC_HEIGHT), VEC_DATA_TYPE(COND_DATA_TYPE, 4)); - COND_DATA_TYPE x1_condition = (COND_DATA_TYPE)(x1 >= 0 && x1 < SRC_WIDTH); - COND_DATA_TYPE y1_condition = (COND_DATA_TYPE)(y1 >= 0 && y1 < SRC_HEIGHT); -#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 - - // Get input and output address - __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w; -#if defined(NUM_GROUPS) - __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w; -#else // defined(NUM_GROUPS) - __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w; -#endif // defined(NUM_GROUPS) - - { - VEC_DATA_TYPE(DATA_TYPE, 4) - row00 = vload4(0, (__global DATA_TYPE *)input_ptr); - DATA_TYPE - row01 = *((__global DATA_TYPE *)input_ptr + 4); - - input_ptr += src_stride_y; - - VEC_DATA_TYPE(DATA_TYPE, 4) - row10 = vload4(0, (__global DATA_TYPE *)input_ptr); - DATA_TYPE - row11 = *((__global DATA_TYPE *)input_ptr + 4); - -#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 - VEC_DATA_TYPE(COND_DATA_TYPE, 4) - cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s0; - VEC_DATA_TYPE(COND_DATA_TYPE, 4) - cond10 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s1; - COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s0); - COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s1); - - // Replace with 0 if the value is not valid - row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00); - row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10); - row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01); - row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11); -#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 - 
- vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01, - row10.s012), - 0, (__global DATA_TYPE *)output_ptr); - vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8); - - input_ptr += src_stride_y; - output_ptr += 10 * dst_stride_x; - } - - { - VEC_DATA_TYPE(DATA_TYPE, 4) - row00 = vload4(0, (__global DATA_TYPE *)input_ptr); - DATA_TYPE - row01 = *((__global DATA_TYPE *)input_ptr + 4); - - input_ptr += src_stride_y; - - VEC_DATA_TYPE(DATA_TYPE, 4) - row10 = vload4(0, (__global DATA_TYPE *)input_ptr); - DATA_TYPE - row11 = *((__global DATA_TYPE *)input_ptr + 4); - -#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 - VEC_DATA_TYPE(COND_DATA_TYPE, 4) - cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s2; - VEC_DATA_TYPE(COND_DATA_TYPE, 4) - cond10 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s3; - COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s2); - COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s3); - - // Replace with 0 if the value is not valid - row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00); - row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10); - row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01); - row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11); -#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 - - vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01, - row10.s012), - 0, (__global DATA_TYPE *)output_ptr); - vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8); - - input_ptr += src_stride_y; - output_ptr += 10 * dst_stride_x; - } - - { - VEC_DATA_TYPE(DATA_TYPE, 4) - row00 = vload4(0, (__global DATA_TYPE *)input_ptr); - DATA_TYPE - row01 = *((__global DATA_TYPE *)input_ptr + 4); - - input_ptr += src_stride_y; - -#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 - VEC_DATA_TYPE(COND_DATA_TYPE, 4) - cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y1_condition; - COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y1_condition); - - // Replace with 0 if the value is not valid - row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00); - row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01); -#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 - - vstore4(row00, 0, (__global DATA_TYPE *)output_ptr); - *((__global DATA_TYPE *)output_ptr + 4) = row01; - - output_ptr += 5 * dst_stride_x; - } - -#ifdef HAS_BIAS -#if defined(NUM_GROUPS) - if((xo / 25) == (SRC_DEPTH / NUM_GROUPS - 1)) -#else // defined(NUM_GROUPS) - if(ch == (SRC_DEPTH - 1)) -#endif // defined(NUM_GROUPS) - { - *((__global DATA_TYPE *)output_ptr) = 1.0f; - } -#endif // HAS_BIAS -} -#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) - -#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) -/** This opencl kernel performs im2col when the kernel size is 11x11, we do not have paddings and the data layout is NCHW - * - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. 
-DCONVOLVED_WIDTH=34 - * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 - * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 - * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. - * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4 - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
- */ -__kernel void im2col11x11_padx0_pady0_nchw( - TENSOR3D_DECLARATION(src), -#if defined(NUM_GROUPS) - TENSOR3D_DECLARATION(dst), -#else // defined(NUM_GROUPS) - IMAGE_DECLARATION(dst), -#endif // defined(NUM_GROUPS) - uint src_stride_w, - uint dst_stride_w) -{ - const int xc = get_global_id(0); // x coordinate in the convolved tensor - const int yc = get_global_id(1); // y coordinate in the convolved tensor - const int ch = get_global_id(2) % SRC_DEPTH; // input feature map - const int batch = get_global_id(2) / SRC_DEPTH; // batch size - - // Calculate input indices - const int xi = xc * STRIDE_X; - const int yi = yc * STRIDE_Y; - - // Calculate output indices -#if defined(NUM_GROUPS) - const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 121; // 11x11 - const int zo = ch / (SRC_DEPTH / NUM_GROUPS); -#else // defined(NUM_GROUPS) - const int xo = ch * 121; // 11x11 -#endif // defined(NUM_GROUPS) - const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution - - // Get input and output address - __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w; -#if defined(NUM_GROUPS) - __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w; -#else // defined(NUM_GROUPS) - __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w; -#endif // defined(NUM_GROUPS) - - { - VEC_DATA_TYPE(DATA_TYPE, 8) - row00 = vload8(0, (__global DATA_TYPE *)(input_ptr)); - VEC_DATA_TYPE(DATA_TYPE, 3) - row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8); - - vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr); - vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8); - - input_ptr += src_stride_y; - output_ptr += 11 * src_stride_x; - } - - { - VEC_DATA_TYPE(DATA_TYPE, 8) - row00 = vload8(0, (__global DATA_TYPE *)(input_ptr)); - VEC_DATA_TYPE(DATA_TYPE, 3) - row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8); - - vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr); - vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8); - - input_ptr += src_stride_y; - output_ptr += 11 * src_stride_x; - } - - { - VEC_DATA_TYPE(DATA_TYPE, 8) - row00 = vload8(0, (__global DATA_TYPE *)(input_ptr)); - VEC_DATA_TYPE(DATA_TYPE, 3) - row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8); - - vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr); - vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8); - - input_ptr += src_stride_y; - output_ptr += 11 * src_stride_x; - } - - { - VEC_DATA_TYPE(DATA_TYPE, 8) - row00 = vload8(0, (__global DATA_TYPE *)(input_ptr)); - VEC_DATA_TYPE(DATA_TYPE, 3) - row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8); - - vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr); - vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8); - - input_ptr += src_stride_y; - output_ptr += 11 * src_stride_x; - } - - { - VEC_DATA_TYPE(DATA_TYPE, 8) - row00 = vload8(0, (__global DATA_TYPE *)(input_ptr)); - VEC_DATA_TYPE(DATA_TYPE, 3) - row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8); - - 
vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr); - vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8); - - input_ptr += src_stride_y; - output_ptr += 11 * src_stride_x; - } - - { - VEC_DATA_TYPE(DATA_TYPE, 8) - row00 = vload8(0, (__global DATA_TYPE *)(input_ptr)); - VEC_DATA_TYPE(DATA_TYPE, 3) - row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8); - - vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr); - vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8); - - input_ptr += src_stride_y; - output_ptr += 11 * src_stride_x; - } - - { - VEC_DATA_TYPE(DATA_TYPE, 8) - row00 = vload8(0, (__global DATA_TYPE *)(input_ptr)); - VEC_DATA_TYPE(DATA_TYPE, 3) - row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8); - - vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr); - vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8); - - input_ptr += src_stride_y; - output_ptr += 11 * src_stride_x; - } - - { - VEC_DATA_TYPE(DATA_TYPE, 8) - row00 = vload8(0, (__global DATA_TYPE *)(input_ptr)); - VEC_DATA_TYPE(DATA_TYPE, 3) - row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8); - - vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr); - vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8); - - input_ptr += src_stride_y; - output_ptr += 11 * src_stride_x; - } - - { - VEC_DATA_TYPE(DATA_TYPE, 8) - row00 = vload8(0, (__global DATA_TYPE *)(input_ptr)); - VEC_DATA_TYPE(DATA_TYPE, 3) - row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8); - - vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr); - vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8); - - input_ptr += src_stride_y; - output_ptr += 11 * src_stride_x; - } - - { - VEC_DATA_TYPE(DATA_TYPE, 8) - row00 = vload8(0, (__global DATA_TYPE *)(input_ptr)); - VEC_DATA_TYPE(DATA_TYPE, 3) - row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8); - - vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr); - vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8); - - input_ptr += src_stride_y; - output_ptr += 11 * src_stride_x; - } - - { - VEC_DATA_TYPE(DATA_TYPE, 8) - row00 = vload8(0, (__global DATA_TYPE *)(input_ptr)); - VEC_DATA_TYPE(DATA_TYPE, 3) - row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8); - - vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr); - vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8); - - output_ptr += 11 * src_stride_x; - } - -#ifdef HAS_BIAS -#if defined(NUM_GROUPS) - if((xo / 121) == (SRC_DEPTH / NUM_GROUPS - 1)) -#else // defined(NUM_GROUPS) - if(ch == (SRC_DEPTH - 1)) -#endif // defined(NUM_GROUPS) - { - *((__global DATA_TYPE *)output_ptr) = 1.0f; - } -#endif // HAS_BIAS -} -#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) - -#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE) -/** This opencl kernel performs im2col when 
the kernel size is greater than 1x1, there is no padding and the data layout is NCHW
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float.
- * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=4.
- * @note The width modulo vector size must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE: e.g. -DWIDTH_MOD_VECTOR_SIZE=3.
- * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
- * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
- * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
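The documentation above pins down the output layout completely, so a single-threaded reference implementation is a useful cross-check. This is a minimal sketch, assuming no grouping and no bias column (all names are illustrative); it mirrors the xo/yo index arithmetic of the kernel body that follows.

#include <vector>

// src is NCHW [depth][src_h][src_w] flattened; the result has one row per
// convolution step (yo) and depth * kh * kw columns, as in the kernel.
std::vector<float> im2col_nchw_nopad(const std::vector<float> &src,
                                     int src_w, int src_h, int depth,
                                     int kw, int kh, int stride_x, int stride_y)
{
    const int conv_w = (src_w - kw) / stride_x + 1; // CONVOLVED_WIDTH
    const int conv_h = (src_h - kh) / stride_y + 1;
    std::vector<float> dst(conv_w * conv_h * depth * kw * kh);
    for(int ch = 0; ch < depth; ++ch)
    {
        for(int yc = 0; yc < conv_h; ++yc)
        {
            for(int xc = 0; xc < conv_w; ++xc)
            {
                const int yo = xc + yc * conv_w; // index of the convolution step
                const int xo = ch * kw * kh;     // column base for this input channel
                for(int yk = 0; yk < kh; ++yk)
                {
                    for(int xk = 0; xk < kw; ++xk)
                    {
                        const int yi = yc * stride_y + yk;
                        const int xi = xc * stride_x + xk;
                        dst[yo * depth * kw * kh + xo + yk * kw + xk] =
                            src[(ch * src_h + yi) * src_w + xi];
                    }
                }
            }
        }
    }
    return dst;
}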
- */ -__kernel void im2col_generic_padx0_pady0_nchw( - TENSOR3D_DECLARATION(src), -#if defined(NUM_GROUPS) - TENSOR3D_DECLARATION(dst), -#else // defined(NUM_GROUPS) - IMAGE_DECLARATION(dst), -#endif // defined(NUM_GROUPS) - uint src_stride_w, - uint dst_stride_w) -{ - const int xc = get_global_id(0); // x coordinate in the convolved tensor - const int yc = get_global_id(1); // y coordinate in the convolved tensor - const int ch = get_global_id(2) % SRC_DEPTH; // input feature map - const int batch = get_global_id(2) / SRC_DEPTH; // batch size - - // Calculate input indices - const int xi = xc * STRIDE_X; - const int yi = yc * STRIDE_Y; - - // Calculate output indices -#if defined(NUM_GROUPS) - const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * KERNEL_WIDTH * KERNEL_HEIGHT; - const int zo = ch / (SRC_DEPTH / NUM_GROUPS); -#else // defined(NUM_GROUPS) - const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT; -#endif // defined(NUM_GROUPS) - const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution - - __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w; -#if defined(NUM_GROUPS) - __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w)) + xo; -#else // defined(NUM_GROUPS) - __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo; -#endif // defined(NUM_GROUPS) - - // Linearize convolution elements - for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y) - { - int last_x = 0; - for(int x = xi, x_e = xi + KERNEL_WIDTH; x + VECTOR_SIZE <= x_e; x += VECTOR_SIZE, output_ptr += VECTOR_SIZE) - { - VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) - row = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y)); - VSTORE(VECTOR_SIZE) - (row, 0, output_ptr); - last_x = x; - } - // Copy the remainder of the row by doing VLOAD(WIDTH_MOD_VECTOR_SIZE) and VSTORE(WIDTH_MOD_VECTOR_SIZE). - // Note that x and output_ptr have already been incremented by VECTOR_SIZE by the loop just before exit. 
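The branch that follows finishes each kernel-window row with a compile-time-sized tail copy. As a plain C++ analogue of the pattern (a VECTOR_SIZE main loop followed by a WIDTH_MOD_VECTOR_SIZE remainder), offered only as a sketch in which the sizes become runtime parameters and memcpy stands in for the vector load/store pair:

#include <cstring>

// Copies one row of 'width' floats in chunks of 'vector_size', then the
// 'width % vector_size' tail, mirroring the VLOAD/VSTORE loop above and the
// WIDTH_MOD_VECTOR_SIZE branch below.
void copy_row_vector_then_tail(const float *src, float *dst, int width, int vector_size)
{
    int x = 0;
    for(; x + vector_size <= width; x += vector_size)
    {
        std::memcpy(dst + x, src + x, vector_size * sizeof(float)); // main vector loop
    }
    const int leftover = width % vector_size; // == WIDTH_MOD_VECTOR_SIZE in the kernel
    if(leftover > 0)
    {
        std::memcpy(dst + x, src + x, leftover * sizeof(float)); // tail copy
    }
}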
-#if WIDTH_MOD_VECTOR_SIZE == 1
- *output_ptr = *((__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));
-#elif WIDTH_MOD_VECTOR_SIZE > 1
- VEC_DATA_TYPE(DATA_TYPE, WIDTH_MOD_VECTOR_SIZE)
- row = VLOAD(WIDTH_MOD_VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));
- VSTORE(WIDTH_MOD_VECTOR_SIZE)
- (row, 0, output_ptr);
-#endif /* WIDTH_MOD_VECTOR_SIZE */
- output_ptr += WIDTH_MOD_VECTOR_SIZE;
- } /* End of loop over KERNEL_HEIGHT */
-
-#ifdef HAS_BIAS
-#if defined(NUM_GROUPS)
- if((xo / (KERNEL_WIDTH * KERNEL_HEIGHT)) == (SRC_DEPTH / NUM_GROUPS - 1))
-#else // defined(NUM_GROUPS)
- if(ch == (SRC_DEPTH - 1))
-#endif // defined(NUM_GROUPS)
- {
- *output_ptr = 1.0f;
- }
-#endif // HAS_BIAS
-}
-#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
-
-#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE)
-
-#define VECTOR_N VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-#define COND_N VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE)
-
-/** Store a 1x9 row or a 3x3 block in a boundary-aware manner to avoid paddings in the channel dimension
- * @name IM2COL1X9_NHWC_STORE
- *
- * @note To use this macro for a 3x3 block, @p ROW has to be 0
- *
- * @param[in] VECTOR_SIZE The non-boundary vector width of @p DATA. Supported: 1(scalar), 2, 3, 4, 8, 16
- * @param[in] BOUNDARY_VECTOR_SIZE The boundary vector width of @p DATA. Supported: 1-16, but has to be <= @p VECTOR_SIZE
- * @param[in] DATA_TYPE Data type of @p DATA
- * @param[in] SRC_DEPTH Input channel size / depth
- * @param[in] DATA Value variable base name
- * @param[in] ROW The row number to store.
Supported: 0-8 - * @param[in] OUTPUT_PTR Output pointer - * @{ - */ -#if defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE -#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ - const bool at_channel_boundary = get_global_id(0) == 0; \ - if(at_channel_boundary) \ - { \ - IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ - } \ - else \ - { \ - IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ - } -#else // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE -#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ - IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) -#endif // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE - -#define IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ - VSTORE(VECTOR_SIZE) \ - (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH); \ - VSTORE(VECTOR_SIZE) \ - (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH); \ - VSTORE(VECTOR_SIZE) \ - (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH); \ - VSTORE(VECTOR_SIZE) \ - (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH); \ - VSTORE(VECTOR_SIZE) \ - (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH); \ - VSTORE(VECTOR_SIZE) \ - (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH); \ - VSTORE(VECTOR_SIZE) \ - (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH); \ - VSTORE(VECTOR_SIZE) \ - (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH); \ - VSTORE(VECTOR_SIZE) \ - (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH); - -#define IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ - VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ - (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH); \ - VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ - (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH); \ - VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ - (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH); \ - VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ - (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH); \ - VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ - (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH); \ - VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ - (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH); \ - VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ - (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH); \ - VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ - (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH); \ - VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ - (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH); -/** @}*/ - -/** This kernel performs im2col when the kernel size is 3x3 and the data layout is NHWC - * - * @note This kernel computes 
VECTOR_SIZE elements - * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements - * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2 - * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1 - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 - * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 - * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1 - * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
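The boundary-aware indexing used by this and the following NHWC kernels shifts every non-boundary vector back by VECTOR_SIZE - BOUNDARY_VECTOR_SIZE so that full-width loads never run past the channel dimension. A small host-side sketch illustrates the resulting offsets, under the assumption that BOUNDARY_VECTOR_SIZE is the channel remainder when SRC_DEPTH is not a multiple of VECTOR_SIZE:

#include <algorithm>
#include <cstdio>

// Mirrors: ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0)
int channel_offset(int global_id, int vector_size, int boundary_vector_size)
{
    const int shift_amount = vector_size - boundary_vector_size;
    return std::max(global_id * vector_size - shift_amount, 0);
}

int main()
{
    // Assumed example: SRC_DEPTH = 10, VECTOR_SIZE = 4 -> BOUNDARY_VECTOR_SIZE = 2
    for(int gid = 0; gid < 3; ++gid)
    {
        std::printf("work-item %d starts at channel %d\n", gid, channel_offset(gid, 4, 2));
    }
    return 0; // prints 0, 2, 6
}

With SRC_DEPTH = 10 and VECTOR_SIZE = 4, the three work-items cover channels [0..1], [2..5] and [6..9]: only the first store is partial, and no full vector reaches past the tensor.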
- */ -__kernel void im2col3x3_nhwc( - TENSOR3D_DECLARATION(src), - IMAGE_DECLARATION(dst), - uint src_stride_w, - uint dst_stride_w) -{ - // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding - const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE; - const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0); - const int yo = get_global_id(1); - const int batch = get_global_id(2); // batch size - - // Calculate input indices - const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X; - const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y; - - // Get input and output address - __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w; - __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w; - - int yi_coord = 0; - int3 offset = 0; - - // Clamp xi - int3 xi_offset = ((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT); -#if PAD_LEFT != 0 || PAD_RIGHT != 0 -#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) - xi_offset = CLAMP(xi_offset, (int3)0, (int3)(SRC_WIDTH - 1)); -#endif // PAD_LEFT != 0 || PAD_RIGHT != 0 - // Multiply by src_stride_y as the width (X) dimension here is the second (y) dimension in src NHWC tensor - xi_offset *= (int3)src_stride_y; - - // Out-of-bound condition for X - int3 x_cond = (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) < (int3)0) || (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) >= (int3)SRC_WIDTH); - - // yi == 0 - // Clamp yi - // yi_coord is casted to unsigned int in order to use just a min() operation - // A "-1" 32 bit signed variable converted to unsigned gives 4294967295 - // This is a trick so that the values loaded in the padding areas are always from the last row (SRC_HEIGHT - 1), - // because of the negative yi_coord wrap-around, but it gets overwritten by PAD_VALUE immediately as the wrap-around - // also causes y_cond (y padding condition) to be satisfied - yi_coord = yi - (int)PAD_TOP; - - // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0 -#if PAD_TOP != 0 || PAD_BOTTOM != 0 - yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); -#endif // PAD_TOP != 0 || PAD_BOTTOM != 0 - - // Compute offset - offset = xi_offset + (yi_coord * (int)src_stride_z); - - // Load input values - VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0)); - VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1)); - VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2)); - -#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 - // Replace invalid values with PAD_VALUE - int y_cond = (int)((uint)(yi - (int)PAD_TOP) >= (uint)(SRC_HEIGHT)); - values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0))); - values1 = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1))); - values2 = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2))); -#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 - - // yi == 1 - // Clamp yi_coord (it can be negative if PAD_TOP > 1) - yi_coord = yi - (int)PAD_TOP + 1 * DILATION_Y; - - // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0 -#if PAD_TOP != 0 || PAD_BOTTOM != 0 - yi_coord = 
min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); -#endif // PAD_TOP != 0 || PAD_BOTTOM != 0 - - // Compute offset - offset = xi_offset + (yi_coord * (int)src_stride_z); - - // Load input values - VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0)); - VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1)); - VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2)); - -#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 - // Replace invalid values with zeros - y_cond = (int)((uint)(yi - (int)PAD_TOP + 1 * DILATION_Y) >= (uint)(SRC_HEIGHT)); - values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0))); - values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1))); - values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2))); -#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 - - // yi == 2 - // Clamp yi_coord - yi_coord = yi - (int)PAD_TOP + 2 * DILATION_Y; - - // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0 -#if PAD_TOP != 0 || PAD_BOTTOM != 0 - yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); -#endif // PAD_TOP != 0 || PAD_BOTTOM != 0 - - // Compute offset - offset = xi_offset + (yi_coord * (int)src_stride_z); - - // Load input values - VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0)); - VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1)); - VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2)); - -#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 - // Replace invalid values with PAD_VALUE - y_cond = (int)((uint)(yi - (int)PAD_TOP + 2 * DILATION_Y) >= (uint)(SRC_HEIGHT)); - values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0))); - values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1))); - values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2))); -#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 - - // Store in a boundary-aware way to avoid padding - IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, 0, output_ptr) - -#ifdef HAS_BIAS - // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is - // added at the end of the channel, while the boundary vec is at the beginning of the channel. 
- // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in - // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE - // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp - if((ch + VECTOR_SIZE) >= SRC_DEPTH) - { - *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 9) = 1.0f; - } -#endif // HAS_BIAS -} - -#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 -#define IM2COL1x9(i) \ - ({ \ - yi_coord = yi - (int)PAD_TOP + i * DILATION_Y; \ - yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); \ - \ - offset0 = xi_offset0 + (yi_coord * (int)src_stride_z); \ - offset1 = xi_offset1 + (yi_coord * (int)src_stride_z); \ - \ - VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \ - VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \ - VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \ - VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \ - VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \ - VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \ - VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \ - VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \ - VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1)); \ - \ - int y_cond = (int)((uint)(yi - (int)PAD_TOP + i * DILATION_Y) >= (uint)(SRC_HEIGHT)); \ - values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s0))); \ - values1 = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s1))); \ - values2 = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s2))); \ - values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s3))); \ - values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s4))); \ - values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s5))); \ - values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s6))); \ - values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s7))); \ - values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond1))); \ - \ - IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \ - }) -#else // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 -#define IM2COL1x9(i) \ - ({ \ - yi_coord = yi - (int)PAD_TOP + i * DILATION_Y; \ - yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); \ - \ - offset0 = xi_offset0 + (yi_coord * (int)src_stride_z); \ - offset1 = xi_offset1 + (yi_coord * (int)src_stride_z); \ - \ - VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \ - VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \ - VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \ - VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \ - VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global 
DATA_TYPE *)(input_ptr + offset0.s4)); \ - VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \ - VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \ - VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \ - VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1)); \ - \ - IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \ - }) -#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 - -/** This kernel performs im2col when the kernel size is 9x9 and the data layout is NHWC - * - * @note This kernel computes VECTOR_SIZE elements - * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements - * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2 - * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1 - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 - * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 - * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1 - * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
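The IM2COL1x9 macros above reuse the signed-to-unsigned clamp trick that the 3x3 kernel's comments describe. A short runnable illustration (host-side, with SRC_HEIGHT as a runtime value) shows why a single min() performs both clamps and how the same unsigned comparison doubles as the padding condition:

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main()
{
    const uint32_t src_height = 8; // stands in for SRC_HEIGHT
    for(int yi_coord : { -2, 0, 5, 9 }) // e.g. yi - PAD_TOP for different output rows
    {
        // A negative index wraps to a huge unsigned value, so min() clamps both ends
        const uint32_t row = std::min((uint32_t)yi_coord, src_height - 1);
        // ...and the very same comparison flags the row for PAD_VALUE replacement
        const bool is_padding = (uint32_t)yi_coord >= src_height;
        std::printf("yi_coord=%2d -> load row %u, padded=%d\n", yi_coord, (unsigned)row, (int)is_padding);
    }
    return 0; // prints rows 7, 0, 5, 7 with padded = 1, 0, 0, 1
}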
- */ -__kernel void im2col9x9_nhwc( - TENSOR3D_DECLARATION(src), - IMAGE_DECLARATION(dst), - uint src_stride_w, - uint dst_stride_w) -{ - // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding - const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE; - const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0); - const int yo = get_global_id(1); - const int batch = get_global_id(2); // batch size - - // Calculate input indices - const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X; - const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y; - - // Get input and output address - __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w; - __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w; - - int yi_coord = 0; - int8 offset0 = 0; - int offset1 = 0; - - // Clamp xi - int8 xi_offset0 = ((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT); - int xi_offset1 = ((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT); - -#if PAD_LEFT != 0 || PAD_RIGHT != 0 -#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) - xi_offset0 = CLAMP(xi_offset0, (int8)0, (int8)(SRC_WIDTH - 1)); - xi_offset1 = CLAMP(xi_offset1, (int)0, (int)(SRC_WIDTH - 1)); -#endif // PAD_LEFT != 0 || PAD_RIGHT != 0 - xi_offset0 *= (int8)src_stride_y; - xi_offset1 *= (int)src_stride_y; - - // Out-of-bound condition for X - int8 x_cond0 = (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) < (int8)0) || (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) >= (int8)SRC_WIDTH); - int x_cond1 = (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) < (int)0) || (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH); - - IM2COL1x9(0); - IM2COL1x9(1); - IM2COL1x9(2); - IM2COL1x9(3); - IM2COL1x9(4); - IM2COL1x9(5); - IM2COL1x9(6); - IM2COL1x9(7); - IM2COL1x9(8); - -#ifdef HAS_BIAS - // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is - // added at the end of the channel, while the boundary vec is at the beginning of the channel. - // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in - // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE - // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp - if((ch + VECTOR_SIZE) >= SRC_DEPTH) - { - *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 81) = 1.0f; - } -#endif // HAS_BIAS -} - -/** This opencl kernel performs a generic im2col implementation when the data layout is NHWC - * - * @note This kernel computes VECTOR_SIZE elements - * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements - * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2 - * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1 - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. 
-DSRC_WIDTH=128 and -DSRC_HEIGHT=128
- * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64
- * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
- * @note The value to store for out-of-bounds loads must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
- * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
- * @note The dilation along the X and Y directions must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
- * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
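As the stores in the kernel below show (output_ptr + i * SRC_DEPTH), the generic NHWC output keeps the channel dimension innermost within each row. The following single-threaded sketch makes that layout explicit; it assumes no grouping, omits the bias column, and uses illustrative names throughout.

#include <vector>

// Reference of the generic NHWC im2col semantics: src is [src_h][src_w][depth],
// dst has one row per convolution step, with columns laid out as i * depth + ch
// where i walks the kernel window row-major (the kernel's inner-loop counter).
std::vector<float> im2col_nhwc_ref(const std::vector<float> &src, int src_w, int src_h,
                                   int depth, int kw, int kh, int stride_x, int stride_y,
                                   int pad_left, int pad_top, int dilation_x, int dilation_y,
                                   int conv_w, int conv_h, float pad_value)
{
    std::vector<float> dst(conv_w * conv_h * kw * kh * depth);
    for(int yc = 0; yc < conv_h; ++yc)
    {
        for(int xc = 0; xc < conv_w; ++xc)
        {
            const int yo = xc + yc * conv_w;
            int i = 0;
            for(int yk = 0; yk < kh; ++yk)
            {
                for(int xk = 0; xk < kw; ++xk, ++i)
                {
                    const int yi = yc * stride_y + yk * dilation_y - pad_top;
                    const int xi = xc * stride_x + xk * dilation_x - pad_left;
                    const bool inside = xi >= 0 && xi < src_w && yi >= 0 && yi < src_h;
                    for(int ch = 0; ch < depth; ++ch)
                    {
                        dst[(yo * kw * kh + i) * depth + ch] =
                            inside ? src[(yi * src_w + xi) * depth + ch] : pad_value;
                    }
                }
            }
        }
    }
    return dst;
}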
- */ -__kernel void im2col_generic_nhwc( - TENSOR3D_DECLARATION(src), - IMAGE_DECLARATION(dst), - uint src_stride_w, - uint dst_stride_w) -{ - // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding - const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE; - const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0); - const int yo = get_global_id(1); - const int batch = get_global_id(2); // batch size - - // Calculate input indices - const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X; - const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y; - - // Get input and output address - __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w; - __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w; - - int i = 0; - for(int yk = 0; yk < KERNEL_HEIGHT; ++yk) - { - // Clamp yi_coord - int yi_coord = yi + yk * DILATION_Y - (int)PAD_TOP; - yi_coord = CLAMP(yi_coord, (int)0, (int)(SRC_HEIGHT - 1)); - - // Out-of-bound condition for Y - int y_border_condition = ((yi + yk * DILATION_Y - (int)PAD_TOP) < (int)0) || ((yi + yk * DILATION_Y - (int)PAD_TOP) >= (int)SRC_HEIGHT); - - for(int xk = 0; xk < KERNEL_WIDTH; ++xk) - { - // Clamp xi_coord - int xi_coord = (xi + xk * DILATION_X - (int)PAD_LEFT); - xi_coord = CLAMP(xi_coord, (int)0, (int)(SRC_WIDTH - 1)); - - // Out-of-bound condition for X - int x_border_condition = ((xi + xk * DILATION_X - (int)PAD_LEFT) < (int)0) || ((xi + xk * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH); - - int offset = xi_coord * (int)src_stride_y + (yi_coord * (int)src_stride_z); - - VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset)); - -#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 - // Replace with PAD_VALUE if the value is out-of-bound - values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)x_border_condition || (COND_N)(y_border_condition))); -#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 - - // Store in a boundary-aware way to avoid padding -#if BOUNDARY_VECTOR_SIZE != VECTOR_SIZE - const bool at_channel_boundary = get_global_id(0) == 0; - if(at_channel_boundary) - { - VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) - (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH); - } - else // at_channel_boundary -#endif // BOUNDARY_VECTOR_SIZE != VECTOR_SIZE - { - VSTORE(VECTOR_SIZE) - (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH); - } - i++; - } - } - -#ifdef HAS_BIAS - // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is - // added at the end of the channel, while the boundary vec is at the beginning of the channel. 
- // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in - // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE - // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp - if((ch + VECTOR_SIZE) >= SRC_DEPTH) - { - *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT) = 1.0f; - } -#endif // HAS_BIAS -} -#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) -#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE) diff --git a/src/core/CL/cl_kernels/instance_normalization.cl b/src/core/CL/cl_kernels/instance_normalization.cl deleted file mode 100644 index adfbebd67d..0000000000 --- a/src/core/CL/cl_kernels/instance_normalization.cl +++ /dev/null @@ -1,254 +0,0 @@ -/* - * Copyright (c) 2019-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) & defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z) -/** This function computes the mean and variance of each plane of the input tensor and provides it as output. - * - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g. -DDATA_TYPE=float - * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value, -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7 - * - * @param[in] input_ptr Pointer to the first source tensor. 
Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor - */ -__kernel void compute_mean_var( - TENSOR4D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output); - -#if defined(NHWC) - const int ch = get_global_id(0); // Current channel - const int batch = get_global_id(1); // Current batch - const int elements_plane = DIM_Y * DIM_Z; - INTERNAL_DATA_TYPE part_sum = 0.f; - INTERNAL_DATA_TYPE part_sum_sq = 0.f; - const int in_offset = input_offset_first_element_in_bytes + batch * input_stride_w + ch * sizeof(DATA_TYPE); - - for(int i_w = 0; i_w < DIM_Y; ++i_w) - { - for(int i_h = 0; i_h < DIM_Z; ++i_h) - { - INTERNAL_DATA_TYPE data = (INTERNAL_DATA_TYPE) * ((__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch)); - part_sum += data; - part_sum_sq += data * data; - } - } - - INTERNAL_DATA_TYPE mean = (part_sum / elements_plane); - INTERNAL_DATA_TYPE var = (part_sum_sq / elements_plane) - (mean * mean); - __global INTERNAL_DATA_TYPE *output_address0 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 0, batch); - *output_address0 = mean; - __global INTERNAL_DATA_TYPE *output_address1 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 1, batch); - *output_address1 = var; -#else // !defined(NHWC) - const int ch = get_global_id(2) % DIM_Z; // Current channel - const int batch = get_global_id(2) / DIM_Z; // Current batch - const int elements_plane = DIM_X * DIM_Y; - - VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE) - part_sum = 0.f; - VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE) - part_sum_sq = 0.f; - // Calculate partial sum - for(int y = 0; y < DIM_Y; ++y) - { - int x = 0; - for(; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE) - { - // Load data - 
VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE) - data = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch)), VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)); - part_sum += data; - part_sum_sq += data * data; - } - // Left-overs loop - for(; x < DIM_X; ++x) - { - INTERNAL_DATA_TYPE data = (INTERNAL_DATA_TYPE)(*((__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch))); - part_sum.s0 += data; - part_sum_sq.s0 += data * data; - } - } - // Perform reduction -#if VEC_SIZE > 8 - part_sum.s01234567 += part_sum.s89abcdef; - part_sum_sq.s01234567 += part_sum_sq.s89abcdef; -#endif // VEC_SIZE > 8 -#if VEC_SIZE > 4 - part_sum.s0123 += part_sum.s4567; - part_sum_sq.s0123 += part_sum_sq.s4567; -#endif // VEC_SIZE > 4 -#if VEC_SIZE > 2 - part_sum.s01 += part_sum.s23; - part_sum_sq.s01 += part_sum_sq.s23; -#endif // VEC_SIZE > 2 - part_sum.s0 += part_sum.s1; - part_sum_sq.s0 += part_sum_sq.s1; - - INTERNAL_DATA_TYPE sum = (INTERNAL_DATA_TYPE)part_sum.s0; - INTERNAL_DATA_TYPE sum_sq = (INTERNAL_DATA_TYPE)part_sum_sq.s0; - - const INTERNAL_DATA_TYPE mean = (sum / elements_plane); - const INTERNAL_DATA_TYPE var = (sum_sq / elements_plane) - (mean * mean); - - __global INTERNAL_DATA_TYPE *output_address0 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 0, batch); - *output_address0 = mean; - __global INTERNAL_DATA_TYPE *output_address1 = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&out, ch, 1, batch); - *output_address1 = var; - -#endif // defined(NHWC) -} -#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z) */ - -#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(GAMMA) && defined(BETA) && defined(EPSILON) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z) -/** This function normalizes the input 2D tensor across the first dimension with respect to mean and standard deviation of the same dimension. - * - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @attention Data type should be passed using the -DDATA_TYPE=data_type compile flag, e.g. -DDATA_TYPE=float - * @attention The scale scalar value applied to the normalized tensor should be passed using the -DGAMMA=value compile flag, e.g. -DGAMMA=1.3 - * @attention The offset scalar value applied to the normalized tensor should be passed using the -DBETA=value compile flag, e.g. -DBETA=2.4 - * @attention Normalization epsilon parameter should be given as a preprocessor argument with -DEPSILON=value. e.g. -DEPSILON=0.001f - * @attention Dimensions X, Y, and Z should be given as a preprocessor argument with -DDIM_X=value, -DDIM_Y=value, -DDIM_Z=value. e.g. -DDIM_X=6, -DDIM_Y=2, -DDIM_Z=7 - * - * @param[in] input_ptr Pointer to the first source tensor. 
Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] output_ptr (Optional) Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z (Optional) Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z (Optional) output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor - */ -__kernel void instance_normalization( - TENSOR4D_DECLARATION(input), - TENSOR3D_DECLARATION(mean_var) -#ifndef IN_PLACE - , - TENSOR4D_DECLARATION(output) -#endif /* IN_PLACE */ -) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Tensor3D mean_var = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(mean_var); -#ifndef IN_PLACE - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); -#endif /* IN_PLACE */ - -#if defined(NHWC) - const int ch = get_global_id(0); // Current channel - const int batch = get_global_id(2); // Current batch -#else /* defined(NHWC) */ - const int ch = get_global_id(2) % DIM_Z; // Current channel - const int batch = get_global_id(2) / DIM_Z; // Current batch -#endif /* defined(NHWC) */ - - const __global INTERNAL_DATA_TYPE *mean_ptr = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&mean_var, ch, 0, batch); - const __global INTERNAL_DATA_TYPE *var_ptr = (__global INTERNAL_DATA_TYPE *)tensor3D_offset(&mean_var, ch, 1, batch); - const INTERNAL_DATA_TYPE mean = (INTERNAL_DATA_TYPE) * mean_ptr; - const INTERNAL_DATA_TYPE var = (INTERNAL_DATA_TYPE) * var_ptr; - const INTERNAL_DATA_TYPE multip = GAMMA / sqrt(var + EPSILON); - const INTERNAL_DATA_TYPE beta = (INTERNAL_DATA_TYPE)BETA; - -#if defined(NHWC) - const int in_offset = input_offset_first_element_in_bytes + batch * input_stride_w + ch * sizeof(DATA_TYPE); -#ifndef IN_PLACE - const int out_offset = output_offset_first_element_in_bytes + batch * input_stride_w + ch * sizeof(DATA_TYPE); -#endif /* IN_PLACE */ - - for(int i_w = 0; i_w < DIM_Y; ++i_w) - { - for(int i_h = 0; i_h < DIM_Z; ++i_h) - { - __global DATA_TYPE *input_address = (__global DATA_TYPE *)tensor4D_offset(&in, ch, i_w, i_h, batch); -#ifdef IN_PLACE - __global DATA_TYPE *output_address = input_address; -#else /* !IN_PLACE */ - __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, ch, i_w, i_h, batch); -#endif /* IN_PLACE */ - 
*(output_address) = (*(input_address) - mean) * multip + (INTERNAL_DATA_TYPE)BETA; - } - } -#else // !defined(NHWC) - for(int y = 0; y < DIM_Y; ++y) - { - int x = 0; - for(; x <= (DIM_X - VEC_SIZE); x += VEC_SIZE) - { - __global DATA_TYPE *input_address = (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); -#ifdef IN_PLACE - __global DATA_TYPE *output_address = input_address; -#else /* !IN_PLACE */ - __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); -#endif /* IN_PLACE */ - - VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE) - data = CONVERT(VLOAD(VEC_SIZE)(0, input_address), VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE)); - - VEC_DATA_TYPE(INTERNAL_DATA_TYPE, VEC_SIZE) - res = (data - mean) * multip + (INTERNAL_DATA_TYPE)BETA; - VSTORE(VEC_SIZE) - (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, output_address); - } - // Left-overs loop - for(; x < DIM_X; ++x) - { - __global DATA_TYPE *input_address = (__global DATA_TYPE *)tensor4D_offset(&in, x, y, ch, batch); -#ifdef IN_PLACE - __global DATA_TYPE *output_address = input_address; -#else /* !IN_PLACE */ - __global DATA_TYPE *output_address = (__global DATA_TYPE *)tensor4D_offset(&out, x, y, ch, batch); -#endif /* IN_PLACE */ - *(output_address) = (*(input_address) - mean) * multip + (INTERNAL_DATA_TYPE)BETA; - } - } -#endif // defined(NHWC) -} -#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(GAMMA) && defined(BETA) && defined(EPSILON) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z) */ diff --git a/src/core/CL/cl_kernels/l2_normalize.cl b/src/core/CL/cl_kernels/l2_normalize.cl deleted file mode 100644 index fbe3406239..0000000000 --- a/src/core/CL/cl_kernels/l2_normalize.cl +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X) -/** This kernel performs l2 normalization on x-axis - * - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE_X=size. e.g. -DVEC_SIZE_X=16 - * @note The leftover size in the X dimension shoud be given as preprocessor argument using -DVEC_SIZE_LEFTOVER_X is; x_dimension % VEC_SIZE_X. e.g. -DVEC_SIZE_LEFTOVER_X=1 - * - * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along X processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] sum_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in] sum_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] sum_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] epsilon Epsilon value - */ -__kernel void l2_normalize_x( - IMAGE_DECLARATION(input), - IMAGE_DECLARATION(sum), - IMAGE_DECLARATION(output), - DATA_TYPE epsilon) -{ - // Offset computation - const uint x_offs = max((int)(get_global_id(0) * VEC_SIZE_X - (VEC_SIZE_X - VEC_SIZE_LEFTOVER_X) % VEC_SIZE_X), 0); - - // Address computation - __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y; - __global uchar *sum_addr = sum_ptr + sum_offset_first_element_in_bytes + get_global_id(1) * sum_stride_y; - __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * output_stride_y; - - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) - in = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)input_addr); - - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) - normalize_value = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X))rsqrt(fmax(*((__global DATA_TYPE *)sum_addr), epsilon)); - - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) - data0 = in * normalize_value; - - STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE_X, VEC_SIZE_LEFTOVER_X, VEC_SIZE_LEFTOVER_X != 0 && get_global_id(0) == 0); -} - -/** This kernel performs l2 normalization on y-axis. - * - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE_X=size. e.g. -DVEC_SIZE_X=16 - * @note The leftover size in the X dimension shoud be given as preprocessor argument using -DVEC_SIZE_LEFTOVER_X is; x_dimension % VEC_SIZE_X. e.g. -DVEC_SIZE_LEFTOVER_X=1 - * - * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] sum_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in] sum_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] sum_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] epsilon Epsilon value - */ -__kernel void l2_normalize_y( - IMAGE_DECLARATION(input), - IMAGE_DECLARATION(sum), - IMAGE_DECLARATION(output), - DATA_TYPE epsilon) -{ - // Offset computation - const uint x_offs = max((int)(get_global_id(0) * VEC_SIZE_X - (VEC_SIZE_X - VEC_SIZE_LEFTOVER_X) % VEC_SIZE_X), 0); - - // Address computation - __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y; - __global uchar *sum_addr = sum_ptr + sum_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE); - __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * output_stride_y; - - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) - in = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)input_addr); - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) - sums = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)sum_addr); - - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) - normalize_value = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X))rsqrt(fmax(sums, epsilon)); - - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) - data0 = in * normalize_value; - - STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE_X, VEC_SIZE_LEFTOVER_X, VEC_SIZE_LEFTOVER_X != 0 && get_global_id(0) == 0); -} - -/** This kernel performs l2 normalization on z-axis. - * - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE_X=size. e.g. -DVEC_SIZE_X=16 - * @note The leftover size in the X dimension shoud be given as preprocessor argument using -DVEC_SIZE_LEFTOVER_X is; x_dimension % VEC_SIZE_X. e.g. -DVEC_SIZE_LEFTOVER_X=1 - * - * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] sum_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in] sum_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] sum_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] sum_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] epsilon Epsilon value - */ -__kernel void l2_normalize_z( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(sum), - TENSOR3D_DECLARATION(output), - DATA_TYPE epsilon) -{ - // Offset computation - const uint x_offs = max((int)(get_global_id(0) * VEC_SIZE_X - (VEC_SIZE_X - VEC_SIZE_LEFTOVER_X) % VEC_SIZE_X), 0); - - // Address computation - __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z; - __global uchar *sum_addr = sum_ptr + sum_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * sum_stride_y; - __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z; - - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) - in = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)input_addr); - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) - sums = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)sum_addr); - - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X) - data0 = in * ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X))(rsqrt(fmax(sums, epsilon)))); - - STORE_VECTOR_SELECT(data, DATA_TYPE, output_addr, VEC_SIZE_X, VEC_SIZE_LEFTOVER_X, VEC_SIZE_LEFTOVER_X != 0 && 
get_global_id(0) == 0); -} -#endif // defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/mean_stddev_normalization.cl b/src/core/CL/cl_kernels/mean_stddev_normalization.cl deleted file mode 100644 index 76be629934..0000000000 --- a/src/core/CL/cl_kernels/mean_stddev_normalization.cl +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (c) 2019, 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(WIDTH) -/** This function normalizes the input 2D tensor across the first dimension with respect to mean and standard deviation of the same dimension. - * - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @attention Data type should be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Width of the input tensor should be passed using the -DWIDTH compile flag, e.g. -DWIDTH=16 - * @attention Normalization epsilon parameter should be given as a preprocessor argument with -DEPSILON=value. e.g. -DEPSILON=0.001f - * - * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] output_ptr (Optional) Pointer to the destination tensor. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x (Optional) Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x (Optional) output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y (Optional) Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y (Optional) output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes (Optional) The offset of the first element in the destination tensor - */ -__kernel void mean_stddev_normalization( - IMAGE_DECLARATION(input) -#ifndef IN_PLACE - , - IMAGE_DECLARATION(output) -#endif /* IN_PLACE */ -) -{ - // Get pixels pointer - Image in = CONVERT_TO_IMAGE_STRUCT(input); -#ifdef IN_PLACE - Image out = in; -#else /* IN_PLACE */ - Image out = CONVERT_TO_IMAGE_STRUCT(output); -#endif /* IN_PLACE */ - - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - sum = 0.f; - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - sum_sq = 0.f; - // Calculate partial sum - int i = 0; - for(; i <= (WIDTH - VEC_SIZE); i += VEC_SIZE) - { - // Load data - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)offset(&in, i, 0)); - - sum += data; - sum_sq += data * data; - } - // Perform reduction - sum = SUM_REDUCE(sum, VEC_SIZE); - sum_sq = SUM_REDUCE(sum_sq, VEC_SIZE); - -#if VEC_SIZE > 1 -#define sum sum.s0 -#define sum_sq sum_sq.s0 -#endif // VEC_SIZE > 1 - - // Left-overs loop - for(; i < WIDTH; ++i) - { - DATA_TYPE data = *((__global DATA_TYPE *)offset(&in, i, 0)); - - sum += data; - sum_sq += data * data; - } - - DATA_TYPE mean = sum / WIDTH; - DATA_TYPE var = (sum_sq / WIDTH) - (mean * mean); - DATA_TYPE stddev_inv = 1.f / sqrt(var + EPSILON); - - i = 0; - for(; i <= (WIDTH - VEC_SIZE); i += VEC_SIZE) - { - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)offset(&in, i, 0)); - - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - res = (data - mean) * stddev_inv; - VSTORE(VEC_SIZE) - (res, 0, (__global DATA_TYPE *)offset(&out, i, 0)); - } - for(; i < WIDTH; ++i) - { - DATA_TYPE data = *((__global DATA_TYPE *)offset(&in, i, 0)); - - *((__global DATA_TYPE *)offset(&out, i, 0)) = (data - mean) * stddev_inv; - } -} -#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(EPSILON) && defined(WIDTH) */ diff --git a/src/core/CL/cl_kernels/memset.cl b/src/core/CL/cl_kernels/memset.cl deleted file mode 100644 index bb46a49f84..0000000000 --- a/src/core/CL/cl_kernels/memset.cl +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(CONSTANT_VALUE) // Check for compile time constants - -/** Fill the tensor's planes with all value - * @attention The following variables must be passed at compile time: - * -# -DDATA_TYPE = Tensor data type. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32 - * -# -DCONSTANT_VALUE = The value use to fill the tensor's planes - * -# -DVEC_SIZE = Vector size - * -# -DLAST_ACCESSED_X = The element that is on the X border (threads trying to set this, might need to step back a bit) - * - * @param[in] tensor_ptr Pointer to the source image. Data types supported: All. - * @param[in] tensor_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] tensor_step_x tensor_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] tensor_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] tensor_step_y tensor_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] tensor_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] value The value used to fill the pages of the tensor - */ -__kernel void memset( - TENSOR3D_DECLARATION(tensor)) -{ - Tensor3D tensor = CONVERT_TO_TENSOR3D_STRUCT(tensor); - -#if defined(VEC_SIZE) - -#if defined(LAST_ACCESSED_X) - // Check if access on width gets out of bounds - // If it does shift access vector to access elements within bounds - const int xi = (int)(get_global_id(0) * VEC_SIZE); - tensor.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * tensor_stride_x; -#endif // defined(LAST_ACCESSED_X) - - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - data = (DATA_TYPE)(CONSTANT_VALUE); - - VSTORE(VEC_SIZE) - (data, 0, (__global DATA_TYPE *)tensor.ptr); -#else // !defined(VEC_SIZE) - *((__global DATA_TYPE *)(tensor.ptr)) = (DATA_TYPE)(CONSTANT_VALUE); -#endif // defined(VEC_SIZE) -} - -#endif // Check for compile time constants diff --git a/src/core/CL/cl_kernels/minmax_layer.cl b/src/core/CL/cl_kernels/minmax_layer.cl deleted file mode 100644 index 655696f9a1..0000000000 --- a/src/core/CL/cl_kernels/minmax_layer.cl +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (c) 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(WIDTH) && defined(HEIGHT) && defined(DEPTH) -/** This function identifies the min and maximum value of an input 3D tensor. - * - * @note The width, height and depth of the input tensor must be provided at compile time using -DWIDTH, -DHEIGHT and -DDEPTH (e.g. -DWIDTH=320, -DHEIGHT=240, -DDEPTH=3) - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source image in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] dst_ptr Pointer to the min/max vector. Minimum value in position 0, maximum value in position 1. Supported data types: F32. - * @param[in] dst_stride_x Stride of the min/max vector in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the min/max vector - */ -__kernel void minmax_layer( - TENSOR3D_DECLARATION(src), - VECTOR_DECLARATION(dst)) -{ - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); - Vector dst = CONVERT_TO_VECTOR_STRUCT(dst); - - float4 min_value = (float4)FLT_MAX; - float4 max_value = (float4) - FLT_MAX; - float2 min_max_value = (float2)(FLT_MAX, -FLT_MAX); - - for(int z = 0; z < DEPTH; ++z) - { - for(int y = 0; y < HEIGHT; ++y) - { - int x = 0; - __global float *src_addr = (__global float *)(src.ptr + y * src_stride_y + z * src_stride_z); - - for(; x <= (int)(WIDTH - 8); x += 8) - { - float8 value = *(src_addr + x); - - min_value = select(value.s0123, min_value, min_value < value.s0123); - min_value = select(value.s4567, min_value, min_value < value.s4567); - - max_value = select(value.s0123, max_value, max_value > value.s0123); - max_value = select(value.s4567, max_value, max_value > value.s4567); - } - - for(; x < WIDTH; ++x) - { - float value = *(src_addr + x); - - min_max_value.s0 = min(min_max_value.s0, value); - min_max_value.s1 = max(min_max_value.s1, value); - } - } - } - - // Perform min/max reduction - min_value.s01 = min(min_value.s01, min_value.s23); - min_value.s0 = min(min_value.s0, min_value.s1); - max_value.s01 = max(max_value.s01, max_value.s23); - max_value.s0 = max(max_value.s0, max_value.s1); - - min_max_value.s0 = min(min_max_value.s0, min_value.s0); - min_max_value.s1 = max(min_max_value.s1, max_value.s0); - - if(min_max_value.s0 == min_max_value.s1) - { - min_max_value.s0 = 0.0f; - min_max_value.s1 = 1.0f; - } - - // Store min and max - vstore2(min_max_value, 0, (__global float *)dst.ptr); -} -#endif // defined(WIDTH) && defined(HEIGHT) && defined(DEPTH) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/batch_to_space.cl 
b/src/core/CL/cl_kernels/nchw/batch_to_space.cl
new file mode 100644
index 0000000000..89129cff3f
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/batch_to_space.cl
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BATCH_SIZE)
+/** Batch to space transformation. (NCHW)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ *
+ * @param[in]  input_ptr                                 Pointer to the source tensor. Supported data types: All
+ * @param[in]  input_stride_x                            Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                              input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                            Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                              input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                            Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                              input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes       The offset of the first element in the first source tensor
+ * @param[in]  batch_id                                  The input tensor batch id
+ * @param[in]  block_shape_ptr                           Pointer to the block shape tensor. Supported data types: S32
+ * @param[in]  block_shape_stride_x                      Stride of the block shape tensor in X dimension (in bytes)
+ * @param[in]  block_shape_step_x                        block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  block_shape_stride_y                      Stride of the block shape tensor in Y dimension (in bytes)
+ * @param[in]  block_shape_step_y                        block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor
+ * @param[out] output_ptr                                Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                           Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                             output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                           Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                             output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                           Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                             output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes      The offset of the first element in the destination tensor
+ */
+__kernel void batch_to_space_nchw(
+    TENSOR3D_DECLARATION(input),
+    const int batch_id,
+    VECTOR_DECLARATION(block_shape),
+    TENSOR4D_DECLARATION(output))
+{
+    Tensor3D in    = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor4D out   = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+    Vector   block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
+
+    const int block_x = *((__global int *)vector_offset(&block, 0));
+    const int block_y = *((__global int *)vector_offset(&block, 1));
+
+    const int r = (BATCH_SIZE / (block_x * block_y));
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+    const int w = batch_id % r;
+
+    const int out_x = x * block_x + (batch_id / r) % block_x;
+    const int out_y = y * block_y + (batch_id / r) / block_x;
+
+    *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, w)) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(BATCH_SIZE)
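As a host-side sanity check of the mapping documented above, the index arithmetic shared by both batch_to_space variants can be written out in plain C. This is a minimal illustrative sketch, not part of the patch; the function name and signature are invented here:

/* Mirrors the kernel's coordinate mapping: input element (x, y, z) of batch
 * batch_id is written to (out_x, out_y, z) of output batch w. batch_size,
 * block_x and block_y correspond to BATCH_SIZE and the two block shape values. */
static void batch_to_space_coords(int x, int y, int batch_id,
                                  int batch_size, int block_x, int block_y,
                                  int *out_x, int *out_y, int *w)
{
    const int r = batch_size / (block_x * block_y);
    *w     = batch_id % r;
    *out_x = x * block_x + (batch_id / r) % block_x;
    *out_y = y * block_y + (batch_id / r) / block_x;
}

For example, with BATCH_SIZE=4 and a 2x2 block shape, r is 1, so input batches 0..3 all map to output batch 0, each filling a different (out_x % 2, out_y % 2) phase of the spatial grid.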
+
+#if defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
+/** Batch to space transformation. (NCHW)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: All
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[in]  batch_id                             The input tensor batch id
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void batch_to_space_static_nchw(
+    TENSOR3D_DECLARATION(input),
+    const int batch_id,
+    TENSOR4D_DECLARATION(output))
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+    const int block_x = BLOCK_SHAPE_X;
+    const int block_y = BLOCK_SHAPE_Y;
+
+    const int r = (BATCH_SIZE / (block_x * block_y));
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2);
+    const int w = batch_id % r;
+
+    const int out_x = x * block_x + (batch_id / r) % block_x;
+    const int out_y = y * block_y + (batch_id / r) / block_x;
+
+    *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, w)) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl b/src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl
new file mode 100644
index 0000000000..2d466661b3
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define INVSQRT_OP(a) rsqrt((a))
+#define SQCVT_SAT(a) (a)
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE)
+#include "activation_float_helpers.h"
+
+/** Apply batch normalization.
+ *
+ * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g.
-DACTIVATION_TYPE=relu + * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively + * + * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p input_ptr + * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) + * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor + * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p input_ptr + * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes) + * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor + * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p input_ptr + * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes) + * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor + * @param[in] gamma_ptr Pointer to the gamma source tensor. 
Supported data types: same as @p input_ptr
+ * @param[in]  gamma_stride_x                       Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in]  gamma_step_x                         gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  gamma_offset_first_element_in_bytes  The offset of the first element in the gamma source tensor
+ * @param[in]  epsilon                              Epsilon parameter in the batch normalization equation
+ */
+__kernel void batchnormalization_layer_nchw(TENSOR3D_DECLARATION(input),
+#ifndef IN_PLACE
+                                            TENSOR3D_DECLARATION(output),
+#endif /* not IN_PLACE */
+                                            VECTOR_DECLARATION(mean),
+                                            VECTOR_DECLARATION(var),
+#ifndef USE_DEFAULT_BETA
+                                            VECTOR_DECLARATION(beta),
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+                                            VECTOR_DECLARATION(gamma),
+#endif /* USE_DEFAULT_GAMMA */
+                                            float epsilon)
+{
+    Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D out = in;
+#else /* IN_PLACE */
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+    Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
+    Vector var  = CONVERT_TO_VECTOR_STRUCT(var);
+#ifndef USE_DEFAULT_BETA
+    Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+    Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
+#endif /* USE_DEFAULT_GAMMA */
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    denominator = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    numerator = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    x_bar = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    res = 0;
+
+    const int current_slice = get_global_id(2);
+
+    data        = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+    denominator = *((__global DATA_TYPE *)(var.ptr + current_slice * var.stride_x));
+    denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
+
+    // Calculate x bar and store results
+    numerator = *((__global DATA_TYPE *)(mean.ptr + current_slice * mean.stride_x));
+    numerator = SUB_OP(data, numerator);
+    x_bar     = MUL_OP(numerator, denominator);
+
+#ifndef USE_DEFAULT_GAMMA
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * gamma.stride_x));
+
+    res = MUL_OP(gamma_vec, x_bar);
+#else  /* USE_DEFAULT_GAMMA */
+    // gamma is equal to 1, no need to perform multiplications
+    res = x_bar;
+#endif /* USE_DEFAULT_GAMMA */
+
+#ifndef USE_DEFAULT_BETA
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    beta_vec = *((__global DATA_TYPE *)(beta.ptr + current_slice * beta.stride_x));
+    // beta is not zero, hence we need to perform the addition
+    res = ADD_OP(res, beta_vec);
+#endif /* USE_DEFAULT_BETA */
+
+    res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res, A_VAL, B_VAL);
+
+    VSTORE(VEC_SIZE)
+    (res, 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE) */
\ No newline at end of file
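Stripped of the vector types and the *_OP macros, the kernel above computes the standard batch normalization equation followed by an optional fused activation. A scalar C sketch of the per-element math, illustrative only (assumes float data; gamma and beta default to 1 and 0 when USE_DEFAULT_GAMMA and USE_DEFAULT_BETA are defined):

#include <math.h>

/* res = gamma * (x - mean) / sqrt(var + epsilon) + beta, evaluated with the
 * mean/var/gamma/beta values of the channel slice the element belongs to. */
static float batchnorm_ref(float x, float mean, float var,
                           float gamma, float beta, float epsilon)
{
    const float inv_stddev = 1.0f / sqrtf(var + epsilon);
    return gamma * (x - mean) * inv_stddev + beta;
}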
diff --git a/src/core/CL/cl_kernels/nchw/channel_shuffle.cl b/src/core/CL/cl_kernels/nchw/channel_shuffle.cl
new file mode 100644
index 0000000000..57d82e1e6f
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/channel_shuffle.cl
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
+
+// Check valid VEC_SIZES
+#if VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
+#error "Only vector sizes 1, 2, 3, 4, 8 and 16 are supported"
+#endif // VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
+
+#define DIV_MOD_UINT(x, y, div_res, mod_res)                \
+    ({                                                      \
+        div_res = (uint)((x) * (float)(1.0f / (float)(y))); \
+        uint r  = div_res * (y);                            \
+        mod_res = (x)-r;                                    \
+    })
+
+/** Performs channel shuffle when the data layout is NCHW. See https://arxiv.org/pdf/1707.01083.pdf for details.
+ *
+ * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
+ * @note The depth of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
+ * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
+ * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
+ *       K is equal to num_channels / num_groups.
+ *
+ * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: All
+ * @param[in]  src_stride_x                      Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the first source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void channel_shuffle_nchw(TENSOR4D_DECLARATION(src),
+                                   TENSOR4D_DECLARATION(dst))
+{
+    uint curr_channel = 0; // channel id of input
+    uint batch_id     = 0; // batch id
+    uint group_id     = 0; // group id
+    uint channel_id   = 0; // channel id within the group
+
+    // Compute curr_channel and batch_id
+    DIV_MOD_UINT(get_global_id(2), SRC_DIM_Z, batch_id, curr_channel);
+
+    // Compute group_id and channel_id
+    DIV_MOD_UINT(curr_channel, K, group_id, channel_id);
+
+    const uint x = get_global_id(0) * VEC_SIZE;
+    const uint y = get_global_id(1) * 2;
+    const uint z = channel_id * NUM_GROUPS + group_id;
+
+    // Load the Nx2 block
+    const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + curr_channel * src_stride_z + batch_id * src_stride_w;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    u0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    u1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+
+    // Store blocks
+    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w;
+    VSTORE(VEC_SIZE)
+    (u0, 0, (__global DATA_TYPE *)(output_ptr + 0 * dst_stride_y));
+    VSTORE(VEC_SIZE)
+    (u1, 0, (__global DATA_TYPE *)(output_ptr + 1 * dst_stride_y));
+}
+
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
\ No newline at end of file
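The net effect of the two DIV_MOD_UINT decompositions above is a transpose of the channel dimension viewed as a NUM_GROUPS x K grid. A small C sketch of just the channel permutation, for illustration (num_groups and k_per_group stand in for the NUM_GROUPS and K defines):

/* Input channel c = group_id * k_per_group + channel_id moves to output
 * channel channel_id * num_groups + group_id, i.e. the (num_groups,
 * k_per_group) channel grid is transposed. */
static int shuffled_channel(int c, int num_groups, int k_per_group)
{
    const int group_id   = c / k_per_group;
    const int channel_id = c % k_per_group;
    return channel_id * num_groups + group_id;
}

With num_groups=2 and k_per_group=3, channels 0,1,2,3,4,5 map to 0,2,4,1,3,5.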
diff --git a/src/core/CL/cl_kernels/nchw/depth_to_space.cl b/src/core/CL/cl_kernels/nchw/depth_to_space.cl
new file mode 100644
index 0000000000..b9f223fe9d
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/depth_to_space.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+/** Depth to space transformation. (NCHW)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor depth size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
+ * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ *
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: All.
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the first source tensor
+ * @param[in]  batch_id                             The input tensor batch id
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void depth_to_space_nchw(
+    TENSOR3D_DECLARATION(input),
+    const int batch_id,
+    TENSOR4D_DECLARATION(output))
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+    const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2) % r;
+
+    const int out_x = x * BLOCK_SHAPE + (get_global_id(2) / r) % BLOCK_SHAPE;
+    const int out_y = y * BLOCK_SHAPE + (get_global_id(2) / r) / BLOCK_SHAPE;
+
+    *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, batch_id)) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/dequantization_layer.cl b/src/core/CL/cl_kernels/nchw/dequantization_layer.cl
new file mode 100644
index 0000000000..e0203f7408
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/dequantization_layer.cl
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
+/** This performs per channel dequantization of 8-bit signed integers to floating point. (NCHW)
+ *
+ * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
+ * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
-DVEC_SIZE=16 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32 + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] scale Pointer to buffer with the per channel quantized scales + */ +__kernel void dequantization_layer_per_channel_nchw( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output), + __global float *scale) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + +#if defined(LAST_ACCESSED_X) + // Check if access on width gets out of bounds + // If it does shift access vector to access elements within bounds + const int xi = (int)(get_global_id(0) * VEC_SIZE); + input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x; + output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x; + + // Load data + VEC_DATA_TYPE(int, VEC_SIZE) + val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE)); + + // Create scale vectors + const VEC_DATA_TYPE(float, VEC_SIZE) + vscale = scale[get_global_id(2)]; + + // Dequantize + VEC_DATA_TYPE(float, VEC_SIZE) + res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE)); + + // Store result + VSTORE(VEC_SIZE) + (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr); +#else // !defined(LAST_ACCESSED_X) + *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(2)]); +#endif // defined(LAST_ACCESSED_X) +} +#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl b/src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl new file mode 100644 index 0000000000..8ab2d1d4ea --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl @@ -0,0 +1,316 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#undef CONVERT_SAT + +#define ADD_OP(a, b) ((a) + (b)) +#define MUL_OP(a, b) ((a) * (b)) +#define CONVERT_SAT(a, b) ((a)) + +#if defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) + +#if STRIDE_X == 3 +#define INPUT_PIXEL_STR(data_size) extract_input_stride3_##data_size +#define INPUT_PIXEL(data_size) INPUT_PIXEL_STR(data_size) +#elif STRIDE_X == 2 +#define INPUT_PIXEL(data_size) extract_input_stride2 +#elif STRIDE_X == 1 +#define INPUT_PIXEL(data_size) extract_input_stride1 +#else /* STRIDE_X not equals 1, 2 or 3 */ +#error "Only support strides 1, 2 and 3" +#endif /* STRIDE_X == 3 */ + +/** Extracts a 1D horizontal vector from the input tensor with stride as 1. + * + * @param[in] input_pixel Pointer to the first pixel. + * + * @return extracted input values. + */ +inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_pixel) +{ + return vload8(0, input_pixel); +} + +/** Extracts a 1D horizontal vector from the input tensor with stride as 2. + * + * @param[in] input_pixel Pointer to the first pixel. + * + * @return extracted input values. + */ +inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_pixel) +{ + VEC_DATA_TYPE(DATA_TYPE, 16) + temp = vload16(0, input_pixel); + return temp.s02468ace; +} + +/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 32-bit data size. + * + * @param[in] input_pixel Pointer to the first pixel. + * + * @return extracted input values. + */ +inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_32(__global const DATA_TYPE *input_pixel) +{ + VEC_DATA_TYPE(DATA_TYPE, 4) + temp1 = vload4(0, input_pixel); + VEC_DATA_TYPE(DATA_TYPE, 4) + temp2 = vload4(0, input_pixel + 6); + VEC_DATA_TYPE(DATA_TYPE, 4) + temp3 = vload4(0, input_pixel + 12); + VEC_DATA_TYPE(DATA_TYPE, 4) + temp4 = vload4(0, input_pixel + 18); + return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s03, temp2.s03, temp3.s03, temp4.s03); +} + +/** Extracts a 1D horizontal vector from the input tensor with stride as 3 and 16-bit data size. + * + * @param[in] input_pixel Pointer to the first pixel. + * + * @return extracted input values. 
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_16(__global const DATA_TYPE *input_pixel)
+{
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    temp1 = vload8(0, input_pixel);
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    temp2 = vload8(0, input_pixel + 8);
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    temp3 = vload8(0, input_pixel + 16);
+    return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s036, temp2.s147, temp3.s25);
+}
+
+/** Extracts a 1D horizontal vector from the input tensor with a stride of 3 and 8-bit data size.
+ *
+ * @param[in] input_pixel Pointer to the first pixel.
+ *
+ * @return extracted input values.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3_8(__global const DATA_TYPE *input_pixel)
+{
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    temp1 = vload16(0, input_pixel);
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    temp2 = vload16(0, input_pixel + 12);
+    return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s0369, temp2.s0369);
+}
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The data size must be passed at compile time using -DDATA_SIZE e.g. -DDATA_SIZE=32
+ * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
+ * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution1x1(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+    VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+    unsigned int weights_stride_w)
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#ifdef HAS_BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+#endif /* defined(HAS_BIAS) */
+
+    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)
+    values = 0;
+
+    const uint z_index = get_global_id(2);
+
+    weights.ptr += z_index * weights_stride_w;
+    for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
+    {
+        DATA_TYPE weight = *(__global DATA_TYPE *)weights.ptr;
+        VEC_DATA_TYPE(DATA_TYPE, 8)
+        input_pixel = INPUT_PIXEL(DATA_SIZE)((__global DATA_TYPE *)src.ptr);
+        values      = ADD_OP(values, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))weight, input_pixel));
+        src.ptr += src_stride_z;
+        weights.ptr += weights_stride_z;
+    }
+
+#ifdef HAS_BIAS
+    values = ADD_OP(values, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, z_index))));
+#endif /* defined(HAS_BIAS) */
+
+    vstore8(CONVERT_SAT(values, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(DATA_SIZE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
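+
+/* A minimal host-side sketch of how this kernel is specialized at build time; the
+ * option values and variable names here are illustrative, not taken from the library:
+ *
+ *   const char *opts = "-DDATA_TYPE=float -DDATA_SIZE=32 -DSTRIDE_X=1 "
+ *                      "-DWEIGHTS_DEPTH=64 -DHAS_BIAS";
+ *   clBuildProgram(program, 1, &device, opts, NULL, NULL);
+ *
+ * With STRIDE_X=1, INPUT_PIXEL(DATA_SIZE) resolves to extract_input_stride1 and each
+ * work-item accumulates WEIGHTS_DEPTH single-weight MADs over 8 adjacent outputs.
+ */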
+
+#if defined(WEIGHTS_DEPTH)
+
+#define CONVOLUTION1x1_BIFROST(acc, src, weight_value) \
+    ({                                                 \
+        acc.s0 = mad(src.s0, weight_value, acc.s0);    \
+        acc.s1 = mad(src.s1, weight_value, acc.s1);    \
+        acc.s2 = mad(src.s2, weight_value, acc.s2);    \
+        acc.s3 = mad(src.s3, weight_value, acc.s3);    \
+    })
+
+/** An optimized direct convolution 1x1 OpenCL kernel for Bifrost architectures when the data type is F32
+ *
+ * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
+ * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used, -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor.
Same as @p src_ptr + * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) + * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor + * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension + */ +__kernel void direct_convolution1x1_f32_bifrost( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + TENSOR3D_DECLARATION(weights), +#ifdef HAS_BIAS + VECTOR_DECLARATION(biases), +#endif /* defined(HAS_BIAS) */ + unsigned int weights_stride_w) +{ + // Get the kernel index + const int kernel_index = get_global_id(2); + + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + float4 acc0 = 0.0f; + float4 acc1 = 0.0f; + float4 acc2 = 0.0f; + float4 acc3 = 0.0f; + + __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w); + __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0); + + for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d) + { + // Load the weights + float weight = *((__global float *)weights_addr); + + // Load values from row0 of input tensor + float4 src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); + float4 src1 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); + float4 src2 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); + float4 src3 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); + + CONVOLUTION1x1_BIFROST(acc0, src0, weight); + CONVOLUTION1x1_BIFROST(acc1, src1, weight); + CONVOLUTION1x1_BIFROST(acc2, src2, weight); + CONVOLUTION1x1_BIFROST(acc3, src3, weight); + + src_addr += src_stride_z; + weights_addr += weights_stride_z; + } + +#ifdef HAS_BIAS + Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); + + float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index))); + + acc0.s0 += bias; + acc0.s1 += bias; + acc0.s2 += bias; + acc0.s3 += bias; + acc1.s0 += bias; + acc1.s1 += bias; + acc1.s2 += bias; + acc1.s3 += bias; + acc2.s0 += bias; + acc2.s1 += bias; + acc2.s2 += bias; + acc2.s3 += bias; + acc3.s0 += bias; + acc3.s1 += bias; + acc3.s2 += bias; + acc3.s3 += bias; +#endif /* defined(HAS_BIAS) */ + + vstore4(acc0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y)); + vstore4(acc1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y)); + vstore4(acc2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y)); + vstore4(acc3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y)); +} +#endif // defined(WEIGHTS_DEPTH) diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl b/src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl new file mode 100644 index 0000000000..811df053c4 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl @@ -0,0 +1,291 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#undef CONVERT_SAT + +#define ADD_OP(a, b) ((a) + (b)) +#define MUL_OP(a, b) ((a) * (b)) +#define CONVERT_SAT(a, b) ((a)) + +#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) + +#if STRIDE_X == 1 +#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) +#elif STRIDE_X == 2 /* STRIDE_X == 1 */ +#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) +#else /* STRIDE_X not equals 1 or 2 */ +#error "STRIDE_X larger than 2 is not supported" +#endif /* STRIDE_X == 2 */ + +#define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, 3) \ + weights_values0 = vload3(0, weights_row_ptr); \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + src0 = vload8(0, src_row_ptr); \ + VEC_DATA_TYPE(DATA_TYPE, 2) \ + src1 = vload2(0, src_row_ptr + 8); \ + \ + acc = ADD_OP(acc, MUL_OP(src0, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \ + acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \ + acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \ + }) + +#define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, 3) \ + weights_values0 = vload3(0, weights_row_ptr); \ + VEC_DATA_TYPE(DATA_TYPE, 16) \ + src0 = vload16(0, src_row_ptr); \ + DATA_TYPE src1 = *(src_row_ptr + 16); \ + \ + acc = ADD_OP(acc, MUL_OP(src0.even, (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0)); \ + acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, src0.s9BDF), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1)); \ + acc = ADD_OP(acc, MUL_OP((VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1), (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2)); \ + }) + +/** This kernel performs a direct convolution to convolve the low three dimensions. + * + * @note This OpenCL kernel works with stride_x = 1 and 2 + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. 
-DDATA_TYPE=float
+ * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution3x3(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+    VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+    unsigned int weights_stride_w)
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)
+    values0 = 0;
+
+    __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);
+    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0);
+
+    const int kernel_index = get_global_id(2);
+    weights_addr += kernel_index * weights_stride_w;
+
+    for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
+    {
+        CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y));
+        CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
+        CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
+
+        src_addr += src_stride_z;
+        weights_addr += weights_stride_z;
+    }
+
+#ifdef HAS_BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+    values0 = ADD_OP(values0, (VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index))));
+#endif /* defined(HAS_BIAS) */
+
+    vstore8(CONVERT_SAT(values0, VEC_DATA_TYPE(DATA_TYPE, 8)), 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
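+
+/* Scalar equivalent of one CONVOLUTION1x3 step above (an illustrative sketch, not
+ * library code): for STRIDE_X == 1 each invocation performs, per weights row,
+ *
+ *   for(int i = 0; i < 8; ++i)      // 8 horizontally adjacent outputs
+ *       for(int k = 0; k < 3; ++k)  // 3 taps of the row
+ *           acc[i] += src_row[i + k] * w_row[k];
+ *
+ * The vector form replaces the inner loop with three swizzled multiply-adds over
+ * the ten loaded inputs (vload8 plus vload2).
+ */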
+
+#if defined(WEIGHTS_DEPTH)
+
+#define CONVOLUTION1x3_BIFROST(acc, src0, src1, weights_row0) \
+    ({                                                        \
+        acc.s0 = mad(src0.s0, weights_row0.s0, acc.s0);       \
+        acc.s1 = mad(src0.s1, weights_row0.s0, acc.s1);       \
+        acc.s2 = mad(src0.s2, weights_row0.s0, acc.s2);       \
+        acc.s3 = mad(src0.s3, weights_row0.s0, acc.s3);       \
+        acc.s0 = mad(src0.s1, weights_row0.s1, acc.s0);       \
+        acc.s1 = mad(src0.s2, weights_row0.s1, acc.s1);       \
+        acc.s2 = mad(src0.s3, weights_row0.s1, acc.s2);       \
+        acc.s3 = mad(src1.s0, weights_row0.s1, acc.s3);       \
+        acc.s0 = mad(src0.s2, weights_row0.s2, acc.s0);       \
+        acc.s1 = mad(src0.s3, weights_row0.s2, acc.s1);       \
+        acc.s2 = mad(src1.s0, weights_row0.s2, acc.s2);       \
+        acc.s3 = mad(src1.s1, weights_row0.s2, acc.s3);       \
+    })
+
+/** An optimized direct convolution 3x3 OpenCL kernel for Bifrost architectures when the data type is F32
+ *
+ * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
+ * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used, -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor.
Same as @p src_ptr + * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) + * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor + * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension + */ +__kernel void direct_convolution3x3_f32_bifrost( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + TENSOR3D_DECLARATION(weights), +#ifdef HAS_BIAS + VECTOR_DECLARATION(biases), +#endif /* defined(HAS_BIAS) */ + unsigned int weights_stride_w) +{ + // Get the kernel index + const int kernel_index = get_global_id(2); + + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + float4 values0 = 0; + float4 values1 = 0; + float4 values2 = 0; + + __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w); + __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0); + + // Note: Since each work-item computes 4x3 elements, we need to load 5 rows from the input tensor + + for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d) + { + // Load the weights + float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y)); + float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y)); + float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y)); + float4 src0; + float2 src1; + + // Load values from row0 of input tensor + src0 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); + src1 = vload2(0, (__global float *)(src_addr + 0 * src_stride_y) + 4); + + CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row0); + + // Load values from row1 of input tensor + src0 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); + src1 = vload2(0, (__global float *)(src_addr + 1 * src_stride_y) + 4); + + // Accumulate + CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row1); + CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row0); + + // Load values from row2 of input tensor + src0 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); + src1 = vload2(0, (__global float *)(src_addr + 2 * src_stride_y) + 4); + + // Accumulate + CONVOLUTION1x3_BIFROST(values0, src0, src1, weights_row2); + CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row1); + CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row0); + + // Load values from row3 of input tensor + src0 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); + src1 = vload2(0, (__global float *)(src_addr + 3 * src_stride_y) + 4); + + // Accumulate + CONVOLUTION1x3_BIFROST(values1, src0, src1, weights_row2); + CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row1); + + // Row4 + src0 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); + src1 = vload2(0, (__global float *)(src_addr + 4 * src_stride_y) + 4); + + // Accumulate + CONVOLUTION1x3_BIFROST(values2, src0, src1, weights_row2); + + src_addr += src_stride_z; + weights_addr += weights_stride_z; + } + +#ifdef HAS_BIAS + Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); + + float bias = (float) * ((__global float *)(vector_offset(&biases, kernel_index))); + + values0 += (float4)bias; + values1 += (float4)bias; + values2 += (float4)bias; +#endif /* defined(HAS_BIAS) */ + + vstore4(values0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y)); + 
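// values1 and values2 hold rows 1 and 2 of the 4x3 output tile computed by each work-item
+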
vstore4(values1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y)); + vstore4(values2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y)); +} +#endif // defined(WEIGHTS_DEPTH) diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl b/src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl new file mode 100644 index 0000000000..59d668f0bf --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl @@ -0,0 +1,313 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#undef CONVERT_SAT + +#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) + +#if STRIDE_X == 1 +#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) +#elif STRIDE_X == 2 /* STRIDE_X == 1 */ +#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) +#else /* STRIDE_X not equals 1 or 2 */ +#error "STRIDE_X larger than 2 is not supported" +#endif /* STRIDE_X == 2 */ + +#define CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + weights_values0 = vload4(0, weights_row_ptr); \ + DATA_TYPE weights_value1 = *(weights_row_ptr + 4); \ + VEC_DATA_TYPE(DATA_TYPE, 8) \ + src0 = vload8(0, src_row_ptr); \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + src1 = vload4(0, src_row_ptr + 8); \ + \ + acc += src0 * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \ + acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1234, src0.s567, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \ + acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s234, src0.s567, src1.s01) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \ + acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s345, src0.s67, src1.s012) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \ + acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s45, src0.s67, src1.s0123) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \ + }) + +#define CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) \ + ({ \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + weights_values0 = vload4(0, weights_row_ptr); \ + DATA_TYPE weights_value1 = *(weights_row_ptr + 4); \ + VEC_DATA_TYPE(DATA_TYPE, 16) \ + src0 = vload16(0, src_row_ptr); \ + VEC_DATA_TYPE(DATA_TYPE, 4) \ + src1 = vload4(0, src_row_ptr + 16); \ + acc += src0.even * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s0; \ + acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s1357, 
src0.s9BDF) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s1; \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s2468, src0.sACE, src1.s0) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s2; \
+        \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s3579, src0.sBDF, src1.s1) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_values0.s3; \
+        acc += (VEC_DATA_TYPE(DATA_TYPE, 8))(src0.s468a, src0.sCE, src1.s02) * (VEC_DATA_TYPE(DATA_TYPE, 8))weights_value1; \
+    })
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: same as @p src_ptr
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution5x5(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+    VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+    unsigned int weights_stride_w)
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    values0 = 0;
+
+    __global uchar *weights_addr = (__global uchar *)tensor3D_offset(&weights, 0, 0, 0);
+    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0);
+
+    const int kernel_index = get_global_id(2);
+    weights_addr += kernel_index * weights_stride_w;
+
+    for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
+    {
+        CONVOLUTION1x5(values0, (__global DATA_TYPE *)src_addr, (__global DATA_TYPE *)weights_addr);
+        CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
+        CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
+        CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y));
+        CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y));
+
+        src_addr += src_stride_z;
+        weights_addr += weights_stride_z;
+    }
+
+#ifdef HAS_BIAS
+    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+
+    values0 += (VEC_DATA_TYPE(DATA_TYPE, 8)) * ((__global DATA_TYPE *)(vector_offset(&biases, kernel_index)));
+#endif /* defined(HAS_BIAS) */
+
+    vstore8(values0, 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
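+
+/* Row-reuse pattern of the Bifrost variant below (an illustrative note, not library
+ * code): two output rows are kept in flight, so of the six input rows read per
+ * channel, rows 1..4 each feed both accumulators:
+ *
+ *   values0 accumulates in_rows 0..4   (output row 0)
+ *   values1 accumulates in_rows 1..5   (output row 1)
+ *
+ * Each row is loaded once with vload8 and consumed up to twice, roughly halving
+ * the row loads versus computing the two output rows independently.
+ */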
+
+#if defined(WEIGHTS_DEPTH)
+
+#define CONVOLUTION1x5_BIFROST(acc, src0, weights_row00, weights_row01) \
+    ({                                                                  \
+        acc.s0 = mad(src0.s0, weights_row00.s0, acc.s0);                \
+        acc.s1 = mad(src0.s1, weights_row00.s0, acc.s1);                \
+        acc.s2 = mad(src0.s2, weights_row00.s0, acc.s2);                \
+        acc.s3 = mad(src0.s3, weights_row00.s0, acc.s3);                \
+        acc.s0 = mad(src0.s1, weights_row00.s1, acc.s0);                \
+        acc.s1 = mad(src0.s2, weights_row00.s1, acc.s1);                \
+        acc.s2 = mad(src0.s3, weights_row00.s1, acc.s2);                \
+        acc.s3 = mad(src0.s4, weights_row00.s1, acc.s3);                \
+        acc.s0 = mad(src0.s2, weights_row00.s2, acc.s0);                \
+        acc.s1 = mad(src0.s3, weights_row00.s2, acc.s1);                \
+        acc.s2 = mad(src0.s4, weights_row00.s2, acc.s2);                \
+        acc.s3 = mad(src0.s5, weights_row00.s2, acc.s3);                \
+        acc.s0 = mad(src0.s3, weights_row00.s3, acc.s0);                \
+        acc.s1 = mad(src0.s4, weights_row00.s3, acc.s1);                \
+        acc.s2 = mad(src0.s5, weights_row00.s3, acc.s2);                \
+        acc.s3 = mad(src0.s6, weights_row00.s3, acc.s3);                \
+        acc.s0 = mad(src0.s4, weights_row01, acc.s0);                   \
+        acc.s1 = mad(src0.s5, weights_row01, acc.s1);                   \
+        acc.s2 = mad(src0.s6, weights_row01, acc.s2);                   \
+        acc.s3 = mad(src0.s7, weights_row01, acc.s3);                   \
+    })
+
+/** An optimized direct convolution 5x5 OpenCL kernel for Bifrost architectures when the data type is F32
+ *
+ * @note This OpenCL kernel works only with stride_x and stride_y equal to 1
+ * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor.
Same as @p src_ptr + * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes) + * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor + * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension + */ +__kernel void direct_convolution5x5_f32_bifrost( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + TENSOR3D_DECLARATION(weights), +#ifdef HAS_BIAS + VECTOR_DECLARATION(biases), +#endif /* defined(HAS_BIAS) */ + unsigned int weights_stride_w) +{ + // Get the kernel index + const int kernel_index = get_global_id(2); + + Image src = CONVERT_TO_IMAGE_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + + float4 values0 = 0.0f; + float4 values1 = 0.0f; + + __global uchar *weights_addr = (__global uchar *)(weights_ptr + weights_offset_first_element_in_bytes + kernel_index * weights_stride_w); + __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0); + + // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor + + for(ushort d = 0; d < (ushort)WEIGHTS_DEPTH; ++d) + { + // Load the weights from row0 and row1 + float4 weights_row00 = vload4(0, (__global float *)(weights_addr + 0 * weights_stride_y)); + float weights_row01 = *((__global float *)(weights_addr + 0 * weights_stride_y) + 4); + float4 weights_row10 = vload4(0, (__global float *)(weights_addr + 1 * weights_stride_y)); + float weights_row11 = *((__global float *)(weights_addr + 1 * weights_stride_y) + 4); + float8 src0; + + // Load values from row0 of input tensor + src0 = vload8(0, (__global float *)(src_addr + 0 * src_stride_y)); + + // Accumulate + CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01); + + // Load values from row1 of input tensor + src0 = vload8(0, (__global float *)(src_addr + 1 * src_stride_y)); + + // Accumulate + CONVOLUTION1x5_BIFROST(values0, src0, weights_row10, weights_row11); + CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01); + + // Load values from row2 of input tensor + src0 = vload8(0, (__global float *)(src_addr + 2 * src_stride_y)); + + // Load weights from row2 + weights_row00 = vload4(0, (__global float *)(weights_addr + 2 * weights_stride_y)); + weights_row01 = *((__global float *)(weights_addr + 2 * weights_stride_y) + 4); + + // Accumulate + CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01); + CONVOLUTION1x5_BIFROST(values1, src0, weights_row10, weights_row11); + + // Load values from row3 of input tensor + src0 = vload8(0, (__global float *)(src_addr + 3 * src_stride_y)); + + // Load weights from row3 + weights_row10 = vload4(0, (__global float *)(weights_addr + 3 * weights_stride_y)); + weights_row11 = *((__global float *)(weights_addr + 3 * weights_stride_y) + 4); + + // Accumulate + CONVOLUTION1x5_BIFROST(values0, src0, weights_row10, weights_row11); + CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01); + + // Load values from row4 of input tensor + src0 = vload8(0, (__global float *)(src_addr + 4 * src_stride_y)); + + // Load weights from row4 + weights_row00 = vload4(0, (__global float *)(weights_addr + 4 * weights_stride_y)); + weights_row01 = *((__global float *)(weights_addr + 4 * weights_stride_y) + 4); + + CONVOLUTION1x5_BIFROST(values0, src0, weights_row00, weights_row01); + CONVOLUTION1x5_BIFROST(values1, src0, weights_row10, weights_row11); + + // Load 
values from row5 of input tensor + src0 = vload8(0, (__global float *)(src_addr + 5 * src_stride_y)); + + // Accumulate + CONVOLUTION1x5_BIFROST(values1, src0, weights_row00, weights_row01); + + src_addr += src_stride_z; + weights_addr += weights_stride_z; + } + +#ifdef HAS_BIAS + Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases); + + float4 bias = (float4) * ((__global float *)(vector_offset(&biases, kernel_index))); + + values0 += bias; + values1 += bias; +#endif /* defined(HAS_BIAS) */ + + vstore4(values0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y)); + vstore4(values1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y)); +} +#endif // defined(WEIGHTS_DEPTH) diff --git a/src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl b/src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl new file mode 100644 index 0000000000..b80d4f587e --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl @@ -0,0 +1,308 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "helpers_asymm.h" + +#undef CONVERT_SAT_STR +#undef CONVERT_SAT + +#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) + +#define CONVERT_SAT_STR(x, type) (convert_##type##8_sat((x))) +#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type) + +#if KERNEL_SIZE == 9 + +#if STRIDE_X == 1 +#define CONVOLUTION1x9(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x9_STRIDE1(acc, src_row_ptr, weights_row_ptr) +#elif STRIDE_X == 2 +#define CONVOLUTION1x9(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x9_STRIDE2(acc, src_row_ptr, weights_row_ptr) +#else /* STRIDE_X not equals 1 or 2 */ +#error "STRIDE_X larger than 2 is not supported" +#endif /* STRIDE_X */ + +#define CONVOLUTION1x9_STRIDE1(acc, src_row_ptr, weights_row_ptr) \ + ({ \ + int8 weights_values0 = convert_int8(vload8(0, weights_row_ptr)); \ + int weights_value1 = convert_int(*(weights_row_ptr + 8)); \ + int16 src0 = convert_int16(vload16(0, src_row_ptr)); \ + acc += (src0.lo + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s1234, src0.s5678) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s2345, src0.s6789) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s3456, src0.s789A) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s4567, src0.s89AB) + INPUT_OFFSET) * ((int8)weights_values0.s4 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s5678, src0.s9ABC) + INPUT_OFFSET) * ((int8)weights_values0.s5 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s6789, src0.sABCD) + INPUT_OFFSET) * ((int8)weights_values0.s6 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s789A, src0.sBCDE) + INPUT_OFFSET) * ((int8)weights_values0.s7 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s89AB, src0.sCDEF) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \ + }) + +#define CONVOLUTION1x9_STRIDE2(acc, src_row_ptr, weights_row_ptr) \ + ({ \ + int8 weights_values0 = convert_int8(vload8(0, weights_row_ptr)); \ + int weights_value1 = convert_int(*(weights_row_ptr + 8)); \ + int16 src0 = convert_int16(vload16(0, src_row_ptr)); \ + int8 src1 = convert_int8(vload8(0, src_row_ptr + 16)); \ + acc += (src0.even + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s1357, src0.s9BDF) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s2468, src0.sACE, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s3579, src0.sBDF, src1.s1) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s468A, src0.sCE, src1.s02) + INPUT_OFFSET) * ((int8)weights_values0.s4 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s579B, src0.sDF, src1.s13) + INPUT_OFFSET) * ((int8)weights_values0.s5 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s68AC, src0.sE, src1.s024) + INPUT_OFFSET) * ((int8)weights_values0.s6 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s79BD, src0.sF, src1.s135) + INPUT_OFFSET) * ((int8)weights_values0.s7 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s8ACE, src1.s0246) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \ + }) + +#elif KERNEL_SIZE == 5 + +#if STRIDE_X == 1 +#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) +#elif STRIDE_X == 2 +#define CONVOLUTION1x5(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) +#else 
/* STRIDE_X not equals 1 or 2 */ +#error "STRIDE_X larger than 2 is not supported" +#endif /* STRIDE_X */ + +#define CONVOLUTION1x5_STRIDE1(acc, src_row_ptr, weights_row_ptr) \ + ({ \ + int4 weights_values0 = convert_int4(vload4(0, weights_row_ptr)); \ + int weights_value1 = convert_int(*(weights_row_ptr + 4)); \ + int8 src0 = convert_int8(vload8(0, src_row_ptr)); \ + int4 src1 = convert_int4(vload4(0, src_row_ptr + 8)); \ + acc += (src0 + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s1234, src0.s567, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s234, src0.s567, src1.s01) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s345, src0.s67, src1.s012) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s45, src0.s67, src1.s0123) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \ + }) + +#define CONVOLUTION1x5_STRIDE2(acc, src_row_ptr, weights_row_ptr) \ + ({ \ + int4 weights_values0 = convert_int4(vload4(0, weights_row_ptr)); \ + int weights_value1 = convert_int(*(weights_row_ptr + 4)); \ + int16 src0 = convert_int16(vload16(0, src_row_ptr)); \ + int4 src1 = convert_int4(vload4(0, src_row_ptr + 16)); \ + acc += (src0.even + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s1357, src0.s9BDF) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s2468, src0.sACE, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s3579, src0.sBDF, src1.s1) + INPUT_OFFSET) * ((int8)weights_values0.s3 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s468a, src0.sCE, src1.s02) + INPUT_OFFSET) * ((int8)weights_value1 + WEIGHTS_OFFSET); \ + }) + +#elif KERNEL_SIZE == 3 + +#if STRIDE_X == 1 +#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) +#elif STRIDE_X == 2 +#define CONVOLUTION1x3(acc, src_row_ptr, weights_row_ptr) CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) +#else /* STRIDE_X not equals 1 or 2 */ +#error "STRIDE_X larger than 2 is not supported" +#endif /* STRIDE_X */ + +#define CONVOLUTION1x3_STRIDE1(acc, src_row_ptr, weights_row_ptr) \ + ({ \ + int3 weights_values0 = convert_int3(vload3(0, weights_row_ptr)); \ + int8 src0 = convert_int8(vload8(0, src_row_ptr)); \ + int2 src1 = convert_int2(vload2(0, src_row_ptr + 8)); \ + acc += (src0 + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s1234, src0.s567, src1.s0) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s234, src0.s567, src1.s01) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ + }) + +#define CONVOLUTION1x3_STRIDE2(acc, src_row_ptr, weights_row_ptr) \ + ({ \ + int3 weights_values0 = convert_int3(vload3(0, weights_row_ptr)); \ + int16 src0 = convert_int16(vload16(0, src_row_ptr)); \ + int src1 = convert_int(*(src_row_ptr + 16)); \ + acc += (src0.even + INPUT_OFFSET) * ((int8)weights_values0.s0 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s1357, src0.s9BDF) + INPUT_OFFSET) * ((int8)weights_values0.s1 + WEIGHTS_OFFSET); \ + acc += ((int8)(src0.s2468, src0.sACE, src1) + INPUT_OFFSET) * ((int8)weights_values0.s2 + WEIGHTS_OFFSET); \ + }) + +#elif KERNEL_SIZE == 1 + +#if STRIDE_X == 3 +#define INPUT_VALUE extract_input_stride3 +#elif STRIDE_X == 2 +#define INPUT_VALUE extract_input_stride2 +#elif STRIDE_X == 1 +#define 
INPUT_VALUE extract_input_stride1
+
+#else /* STRIDE_X not equal to 1, 2 or 3 */
+#error "Only strides 1, 2 and 3 are supported"
+#endif /* STRIDE_X */
+
+/** Extracts a 1D horizontal vector from the input tensor with a stride of 1.
+ *
+ * @param[in] input_value Pointer to the first value.
+ *
+ * @return extracted input values.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride1(__global const DATA_TYPE *input_value)
+{
+    return vload8(0, input_value);
+}
+
+/** Extracts a 1D horizontal vector from the input tensor with a stride of 2.
+ *
+ * @param[in] input_value Pointer to the first value.
+ *
+ * @return extracted input values.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride2(__global const DATA_TYPE *input_value)
+{
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    temp = vload16(0, input_value);
+    return temp.s02468ace;
+}
+
+/** Extracts a 1D horizontal vector from the input tensor with a stride of 3 and 8-bit data size.
+ *
+ * @param[in] input_value Pointer to the first value.
+ *
+ * @return extracted input values.
+ */
+inline VEC_DATA_TYPE(DATA_TYPE, 8) extract_input_stride3(__global const DATA_TYPE *input_value)
+{
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    temp1 = vload16(0, input_value);
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    temp2 = vload16(0, input_value + 12);
+    return (VEC_DATA_TYPE(DATA_TYPE, 8))(temp1.s0369, temp2.s0369);
+}
+
+#else /* KERNEL_SIZE not equal to 1, 3, 5 or 9 */
+#error "Only kernel sizes 1, 3, 5 and 9 are supported"
+#endif /* KERNEL_SIZE */
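+
+/* Why offsets are added inside the multiply-accumulates below (an illustrative
+ * note): with asymmetric quantization a real value is r = scale * (q - zero_point),
+ * so the integer core of the convolution accumulates, per tap,
+ *
+ *   acc += (src_q + INPUT_OFFSET) * (w_q + WEIGHTS_OFFSET);
+ *
+ * where the two offsets fold the zero points into the sum (their sign convention is
+ * fixed by the host code that builds this kernel), and the s32 accumulator is then
+ * requantized to the output domain at the end of the kernel.
+ */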
+
+/** This kernel performs a direct convolution to convolve the low three dimensions.
+ *
+ * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
+ * @note The third dimension of the weights tensor must be passed at compile time using -DWEIGHTS_DEPTH
+ * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ * @note The output quantization multiplier must be passed at compile time using -DOUTPUT_MULTIPLIER e.g. -DOUTPUT_MULTIPLIER=1234
+ * @note The output quantization shift must be passed at compile time using -DOUTPUT_SHIFT e.g. -DOUTPUT_SHIFT=4
+ * @note The input offset quantization parameter must be passed at compile time using -DINPUT_OFFSET e.g. -DINPUT_OFFSET=3
+ * @note The weights offset quantization parameter must be passed at compile time using -DWEIGHTS_OFFSET e.g. -DWEIGHTS_OFFSET=3
+ * @note The destination offset quantization parameter must be passed at compile time using -DOUTPUT_OFFSET e.g. -DOUTPUT_OFFSET=3
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
+ * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] biases_ptr Pointer to the biases tensor. Supported data types: S32
+ * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
+ * @param[in] weights_stride_w Stride of the weights tensor in the 4th dimension
+ */
+__kernel void direct_convolution_quantized(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    TENSOR3D_DECLARATION(weights),
+#ifdef HAS_BIAS
+    VECTOR_DECLARATION(biases),
+#endif /* defined(HAS_BIAS) */
+    unsigned int weights_stride_w)
+{
+    Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
+    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
+    Tensor3D dst     = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+    int8 values0 = 0;
+
+    __global DATA_TYPE *weights_addr = (__global DATA_TYPE *)tensor3D_offset(&weights, 0, 0, 0);
+    __global DATA_TYPE *src_addr     = (__global DATA_TYPE *)offset(&src, 0, 0);
+
+    const int kernel_index = get_global_id(2);
+    weights_addr += kernel_index * weights_stride_w;
+
+    for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
+    {
+#if KERNEL_SIZE == 9
+        CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y));
+        CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
+        CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
+        CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y));
+        CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y));
+        CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 5 * weights_stride_y));
DATA_TYPE *)(weights_addr + 5 * weights_stride_y));
+        CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 6 * weights_stride_y));
+        CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 7 * weights_stride_y));
+        CONVOLUTION1x9(values0, (__global DATA_TYPE *)(src_addr + 8 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 8 * weights_stride_y));
+#elif KERNEL_SIZE == 5
+        CONVOLUTION1x5(values0, (__global DATA_TYPE *)src_addr, (__global DATA_TYPE *)weights_addr);
+        CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
+        CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
+        CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 3 * weights_stride_y));
+        CONVOLUTION1x5(values0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 4 * weights_stride_y));
+#elif KERNEL_SIZE == 3
+        CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y));
+        CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y));
+        CONVOLUTION1x3(values0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y), (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y));
+#elif KERNEL_SIZE == 1
+        int  weight      = convert_int(*(__global DATA_TYPE *)weights_addr);
+        int8 input_value = convert_int8(INPUT_VALUE((__global DATA_TYPE *)src_addr));
+        values0 += (input_value + INPUT_OFFSET) * ((int8)weight + WEIGHTS_OFFSET);
+#endif /* (KERNEL_SIZE == 1) || (KERNEL_SIZE == 3) || (KERNEL_SIZE == 5) || (KERNEL_SIZE == 9) */
+
+        src_addr += src_stride_z;
+        weights_addr += weights_stride_z;
+    }
+
+#ifdef HAS_BIAS
+    Vector        biases    = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
+    __global int *bias_addr = ((__global int *)(vector_offset(&biases, kernel_index)));
+    values0 += (int8)(*bias_addr);
+#endif /* defined(HAS_BIAS) */
+
+#if OUTPUT_SHIFT < 0
+    values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#else // OUTPUT_SHIFT < 0
+    values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#endif // OUTPUT_SHIFT < 0
+    values0 = values0 + OUTPUT_OFFSET;
+
+    vstore8(CONVERT_SAT(values0, DATA_TYPE), 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
diff --git a/src/core/CL/cl_kernels/nchw/im2col.cl b/src/core/CL/cl_kernels/nchw/im2col.cl
new file mode 100644
index 0000000000..fddf918c63
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/im2col.cl
@@ -0,0 +1,863 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#if defined(DATA_TYPE) && defined(ELEMENT_SIZE)
+
+#if ELEMENT_SIZE == 1
+#define COND_DATA_TYPE char
+#elif ELEMENT_SIZE == 2
+#define COND_DATA_TYPE short
+#elif ELEMENT_SIZE == 4
+#define COND_DATA_TYPE int
+#else // ELEMENT_SIZE
+#error "Element size not supported"
+#endif // ELEMENT_SIZE
+
+#if defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH)
+/** This opencl kernel performs im2col when the kernel size is 1x1, the stride_x = 1 and the data layout is NCHW
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor.
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). + */ +__kernel void im2col1x1_stridex1_nchw( + TENSOR3D_DECLARATION(src), +#if defined(NUM_GROUPS) + TENSOR3D_DECLARATION(dst), +#else // defined(NUM_GROUPS) + IMAGE_DECLARATION(dst), +#endif // defined(NUM_GROUPS) + uint src_stride_w, + uint dst_stride_w) +{ + const uint xc = get_global_id(0) * 4; // x coordinate in the convolved tensor + const uint yc = get_global_id(1); // y coordinate in the convolved tensor + const uint ch = get_global_id(2) % SRC_DEPTH; // input feature map + const uint batch = get_global_id(2) / SRC_DEPTH; // batch size + + // Clamp xc + // The strategy clamps at "xc" as it will be a valid value for sure + uint4 xc_clamped = xc + (uint4)(0, 1, 2, 3); + + // Check which values are valid + const VEC_DATA_TYPE(COND_DATA_TYPE, 4) cond0 = CONVERT((xc_clamped < SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4)); + + xc_clamped = select((uint4)xc, xc_clamped, convert_int4(cond0)); + + // Calculate input indices + const uint xi = xc; + const uint yi = yc * STRIDE_Y; + + // Calculate output indices + +#if defined(NUM_GROUPS) + const uint xo = ch % (SRC_DEPTH / NUM_GROUPS); + const uint zo = ch / (SRC_DEPTH / NUM_GROUPS); +#else // defined(NUM_GROUPS) + const uint xo = ch; +#endif // defined(NUM_GROUPS) + const uint4 yo = xc_clamped + yc * CONVOLVED_WIDTH; // Index of the convolution + + // Get input and output address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w; +#if defined(NUM_GROUPS) + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + zo * dst_stride_z + batch * dst_stride_w; +#else // defined(NUM_GROUPS) + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + batch * dst_stride_w; +#endif // defined(NUM_GROUPS) + + VEC_DATA_TYPE(DATA_TYPE, 4) + data = vload4(0, (__global DATA_TYPE *)input_ptr); + + // If out-of-bound, overwrite with the first element + data = select((VEC_DATA_TYPE(DATA_TYPE, 4))data.s0, data, cond0); + + *(__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) = data.s0; + *(__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) = data.s1; + *(__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) = data.s2; + *(__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) = data.s3; + +#ifdef HAS_BIAS +#if defined(NUM_GROUPS) + if(xo == (SRC_DEPTH / NUM_GROUPS - 1)) +#else // defined(NUM_GROUPS) + if(ch == (SRC_DEPTH - 1)) +#endif // defined(NUM_GROUPS) + { + *((__global DATA_TYPE *)(output_ptr + yo.s0 * dst_stride_y) + 1) = 1.0f; + *((__global DATA_TYPE *)(output_ptr + yo.s1 * dst_stride_y) + 1) 
= 1.0f; + *((__global DATA_TYPE *)(output_ptr + yo.s2 * dst_stride_y) + 1) = 1.0f; + *((__global DATA_TYPE *)(output_ptr + yo.s3 * dst_stride_y) + 1) = 1.0f; + } +#endif // HAS_BIAS +} +#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_Y) && defined(SRC_DEPTH) + +#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) +#if defined(DILATION_X) && defined(DILATION_Y) +/** This opencl kernel performs a generic im2col implementation when the data layout is NCHW + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128 + * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 + * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64 + * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2 + * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0 + * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 + * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1 + * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). + */ +__kernel void im2col_generic_nchw( + TENSOR3D_DECLARATION(src), +#if defined(NUM_GROUPS) + TENSOR3D_DECLARATION(dst), +#else // defined(NUM_GROUPS) + IMAGE_DECLARATION(dst), +#endif // defined(NUM_GROUPS) + uint src_stride_w, + uint dst_stride_w) +{ + const int xc = get_global_id(0); // x coordinate in the convolved tensor + const int yc = get_global_id(1); // y coordinate in the convolved tensor + const int ch = get_global_id(2) % SRC_DEPTH; // input feature map + const int batch = get_global_id(2) / SRC_DEPTH; // batch size + + // Calculate input indices + const int xi = xc * STRIDE_X - PAD_LEFT; + const int yi = yc * STRIDE_Y - PAD_TOP; + + // Calculate output indices +#if defined(NUM_GROUPS) + const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * KERNEL_WIDTH * KERNEL_HEIGHT; + const int zo = ch / (SRC_DEPTH / NUM_GROUPS); +#else // defined(NUM_GROUPS) + const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT; +#endif // defined(NUM_GROUPS) + const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution + + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w; +#if defined(NUM_GROUPS) + __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w)) + xo; +#else // defined(NUM_GROUPS) + __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo; +#endif // defined(NUM_GROUPS) + + // Linearize convolution elements + for(int yk = 0; yk < KERNEL_HEIGHT; ++yk) + { + int y = yi + yk * DILATION_Y; + for(int xk = 0; xk < KERNEL_WIDTH; ++xk, ++output_ptr) + { + int x = xi + xk * DILATION_X; +#if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 + *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y)); +#else // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 + if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT) + { + *output_ptr = PAD_VALUE; + } + else + { + *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y)); + } +#endif // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0 + } + } + +#ifdef HAS_BIAS +#if defined(NUM_GROUPS) + if((xo / (KERNEL_WIDTH * KERNEL_HEIGHT)) == (SRC_DEPTH / NUM_GROUPS - 1)) +#else // defined(NUM_GROUPS) + if(ch == (SRC_DEPTH - 1)) +#endif // defined(NUM_GROUPS) + { + *output_ptr = 1.0f; + } +#endif // HAS_BIAS +} +#endif // defined(DILATION_X) && defined(DILATION_Y) + +/** This opencl kernel performs im2col when the kernel 
size is 3x3 and the data layout is NCHW + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128 + * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 + * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 + * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2 + * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0 + * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 + * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
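 *
 * An illustrative set of build options for this kernel (the concrete values below are example assumptions,
 * not taken from the library): "-DDATA_TYPE=float -DELEMENT_SIZE=4 -DSRC_WIDTH=128 -DSRC_HEIGHT=128
 * -DSRC_DEPTH=3 -DCONVOLVED_WIDTH=126 -DSTRIDE_X=1 -DSTRIDE_Y=1 -DPAD_LEFT=0 -DPAD_RIGHT=0 -DPAD_TOP=0
 * -DPAD_BOTTOM=0 -DPAD_VALUE=0". With a 3x3 kernel, stride 1 and no padding, CONVOLVED_WIDTH follows as
 * (128 - 3) / 1 + 1 = 126. In case grouping is performed, -DNUM_GROUPS has to be added as well.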
+ */ +__kernel void im2col3x3_nchw( + TENSOR3D_DECLARATION(src), +#if defined(NUM_GROUPS) + TENSOR3D_DECLARATION(dst), +#else // defined(NUM_GROUPS) + IMAGE_DECLARATION(dst), +#endif // defined(NUM_GROUPS) + uint src_stride_w, + uint dst_stride_w) +{ + const int xc = get_global_id(0); // x coordinate in the convolved tensor + const int yc = get_global_id(1); // y coordinate in the convolved tensor + const int ch = get_global_id(2) % SRC_DEPTH; // input feature map + const int batch = get_global_id(2) / SRC_DEPTH; // batch size + + // Calculate input indices + const int xi = xc * STRIDE_X - PAD_LEFT; + const int yi = yc * STRIDE_Y - PAD_TOP; + + // Calculate output indices +#if defined(NUM_GROUPS) + const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 9; // 3x3 + const int zo = ch / (SRC_DEPTH / NUM_GROUPS); +#else // defined(NUM_GROUPS) + const int xo = ch * 9; // 3x3 +#endif // defined(NUM_GROUPS) + const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution + + // Get input and output address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w; +#if defined(NUM_GROUPS) + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w; +#else // defined(NUM_GROUPS) + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w; +#endif // defined(NUM_GROUPS) + + VEC_DATA_TYPE(DATA_TYPE, 3) + row0 = vload3(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y)); + VEC_DATA_TYPE(DATA_TYPE, 3) + row1 = vload3(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y)); + VEC_DATA_TYPE(DATA_TYPE, 3) + row2 = vload3(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y)); + +#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 + // Put 0 if the value is out-of-bound + int3 x = (int3)xi + (int3)(0, 1, 2); + int3 y = (int3)yi + (int3)(0, 1, 2); + + VEC_DATA_TYPE(COND_DATA_TYPE, 3) + cond0 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s0 >= 0 && y.s0 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3)); + VEC_DATA_TYPE(COND_DATA_TYPE, 3) + cond1 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s1 >= 0 && y.s1 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3)); + VEC_DATA_TYPE(COND_DATA_TYPE, 3) + cond2 = CONVERT((x >= (int3)0 && x < (int3)SRC_WIDTH && (int3)(y.s2 >= 0 && y.s2 < SRC_HEIGHT)), VEC_DATA_TYPE(COND_DATA_TYPE, 3)); + + row0 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row0, cond0); + row1 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row1, cond1); + row2 = select((VEC_DATA_TYPE(DATA_TYPE, 3))PAD_VALUE, row2, cond2); +#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 + + vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row0.s012, row1.s012, row2.s01), 0, (__global DATA_TYPE *)output_ptr); + *((__global DATA_TYPE *)output_ptr + 8) = row2.s2; + +#ifdef HAS_BIAS +#if defined(NUM_GROUPS) + if((xo / 9) == (SRC_DEPTH / NUM_GROUPS - 1)) +#else // defined(NUM_GROUPS) + if(ch == (SRC_DEPTH - 1)) +#endif // defined(NUM_GROUPS) + { + *((__global DATA_TYPE *)output_ptr + 9) = 1.0f; + } +#endif // HAS_BIAS +} + +/** This opencl kernel performs im2col when the kernel size is 5x5 and the data layout is NCHW + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. 
-DDATA_TYPE=float + * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128 + * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 + * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 + * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2 + * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0 + * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 + * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
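 *
 * An illustrative set of build options for this kernel (the concrete values below are example assumptions,
 * not taken from the library): "-DDATA_TYPE=float -DELEMENT_SIZE=4 -DSRC_WIDTH=128 -DSRC_HEIGHT=128
 * -DSRC_DEPTH=3 -DCONVOLVED_WIDTH=124 -DSTRIDE_X=1 -DSTRIDE_Y=1 -DPAD_LEFT=0 -DPAD_RIGHT=0 -DPAD_TOP=0
 * -DPAD_BOTTOM=0 -DPAD_VALUE=0". With a 5x5 kernel, stride 1 and no padding, CONVOLVED_WIDTH follows as
 * (128 - 5) / 1 + 1 = 124.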
+ */ +__kernel void im2col5x5_nchw( + TENSOR3D_DECLARATION(src), +#if defined(NUM_GROUPS) + TENSOR3D_DECLARATION(dst), +#else // defined(NUM_GROUPS) + IMAGE_DECLARATION(dst), +#endif // defined(NUM_GROUPS) + uint src_stride_w, + uint dst_stride_w) +{ + const int xc = get_global_id(0); // x coordinate in the convolved tensor + const int yc = get_global_id(1); // y coordinate in the convolved tensor + const int ch = get_global_id(2) % SRC_DEPTH; // input feature map + const int batch = get_global_id(2) / SRC_DEPTH; // batch size + + // Calculate input indices + const int xi = xc * STRIDE_X - PAD_LEFT; + const int yi = yc * STRIDE_Y - PAD_TOP; + + // Calculate output indices +#if defined(NUM_GROUPS) + const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 25; // 5x5 + const int zo = ch / (SRC_DEPTH / NUM_GROUPS); +#else // defined(NUM_GROUPS) + const int xo = ch * 25; // 5x5 +#endif // defined(NUM_GROUPS) + const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution + +#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 + // Put 0 if the value is out-of-bound + int4 x0 = (int4)xi + (int4)(0, 1, 2, 3); + int4 y0 = (int4)yi + (int4)(0, 1, 2, 3); + int x1 = xi + 4; + int y1 = yi + 4; + + // Check if we could have out-of-bounds elements in the x direction + VEC_DATA_TYPE(COND_DATA_TYPE, 4) + x0_condition = CONVERT((x0 >= (int4)0 && x0 < (int4)SRC_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, 4)); + VEC_DATA_TYPE(COND_DATA_TYPE, 4) + y0_condition = CONVERT((y0 >= (int4)0 && y0 < (int4)SRC_HEIGHT), VEC_DATA_TYPE(COND_DATA_TYPE, 4)); + COND_DATA_TYPE x1_condition = (COND_DATA_TYPE)(x1 >= 0 && x1 < SRC_WIDTH); + COND_DATA_TYPE y1_condition = (COND_DATA_TYPE)(y1 >= 0 && y1 < SRC_HEIGHT); +#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 + + // Get input and output address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * (int)src_stride_x + yi * (int)src_stride_y + ch * src_stride_z + batch * src_stride_w; +#if defined(NUM_GROUPS) + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w; +#else // defined(NUM_GROUPS) + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w; +#endif // defined(NUM_GROUPS) + + { + VEC_DATA_TYPE(DATA_TYPE, 4) + row00 = vload4(0, (__global DATA_TYPE *)input_ptr); + DATA_TYPE + row01 = *((__global DATA_TYPE *)input_ptr + 4); + + input_ptr += src_stride_y; + + VEC_DATA_TYPE(DATA_TYPE, 4) + row10 = vload4(0, (__global DATA_TYPE *)input_ptr); + DATA_TYPE + row11 = *((__global DATA_TYPE *)input_ptr + 4); + +#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 + VEC_DATA_TYPE(COND_DATA_TYPE, 4) + cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s0; + VEC_DATA_TYPE(COND_DATA_TYPE, 4) + cond10 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s1; + COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s0); + COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s1); + + // Replace with 0 if the value is not valid + row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00); + row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10); + row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01); + row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11); +#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 + 
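+        // Store rows 0 and 1 of the 5x5 patch (2 x 5 = 10 linearized elements) as one 8-element
+        // store followed by one 2-element store.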
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01,
+                                              row10.s012),
+                0, (__global DATA_TYPE *)output_ptr);
+        vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+        input_ptr += src_stride_y;
+        output_ptr += 10 * dst_stride_x;
+    }
+
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
+        DATA_TYPE
+        row01 = *((__global DATA_TYPE *)input_ptr + 4);
+
+        input_ptr += src_stride_y;
+
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        row10 = vload4(0, (__global DATA_TYPE *)input_ptr);
+        DATA_TYPE
+        row11 = *((__global DATA_TYPE *)input_ptr + 4);
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+        VEC_DATA_TYPE(COND_DATA_TYPE, 4)
+        cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s2;
+        VEC_DATA_TYPE(COND_DATA_TYPE, 4)
+        cond10 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y0_condition.s3;
+        COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y0_condition.s2);
+        COND_DATA_TYPE cond11 = (COND_DATA_TYPE)(x1_condition && y0_condition.s3);
+
+        // Replace with 0 if the value is not valid
+        row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
+        row10 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row10, cond10);
+        row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
+        row11 = select((DATA_TYPE)PAD_VALUE, row11, cond11);
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+
+        vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s0123, row01,
+                                              row10.s012),
+                0, (__global DATA_TYPE *)output_ptr);
+        vstore2((VEC_DATA_TYPE(DATA_TYPE, 2))(row10.s3, row11), 0, (__global DATA_TYPE *)output_ptr + 8);
+
+        input_ptr += src_stride_y;
+        output_ptr += 10 * dst_stride_x;
+    }
+
+    {
+        VEC_DATA_TYPE(DATA_TYPE, 4)
+        row00 = vload4(0, (__global DATA_TYPE *)input_ptr);
+        DATA_TYPE
+        row01 = *((__global DATA_TYPE *)input_ptr + 4);
+
+        input_ptr += src_stride_y;
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+        VEC_DATA_TYPE(COND_DATA_TYPE, 4)
+        cond00 = x0_condition && (VEC_DATA_TYPE(COND_DATA_TYPE, 4))y1_condition;
+        COND_DATA_TYPE cond01 = (COND_DATA_TYPE)(x1_condition && y1_condition);
+
+        // Replace with 0 if the value is not valid
+        row00 = select((VEC_DATA_TYPE(DATA_TYPE, 4))PAD_VALUE, row00, cond00);
+        row01 = select((DATA_TYPE)PAD_VALUE, row01, cond01);
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+
+        vstore4(row00, 0, (__global DATA_TYPE *)output_ptr);
+        *((__global DATA_TYPE *)output_ptr + 4) = row01;
+
+        output_ptr += 5 * dst_stride_x;
+    }
+
+#ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+    if((xo / 25) == (SRC_DEPTH / NUM_GROUPS - 1))
+#else // defined(NUM_GROUPS)
+    if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
+    {
+        *((__global DATA_TYPE *)output_ptr) = 1.0f;
+    }
+#endif // HAS_BIAS
+}
+#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE)
+
+#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH)
+/** This opencl kernel performs im2col when the kernel size is 11x11, there is no padding and the data layout is NCHW
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g.
-DCONVOLVED_WIDTH=34 + * @note The number of input channels must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 + * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 + * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
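 *
 * An illustrative set of build options for this kernel (the concrete values below are example assumptions,
 * not taken from the library): "-DDATA_TYPE=float -DELEMENT_SIZE=4 -DSRC_DEPTH=3 -DSTRIDE_X=4 -DSTRIDE_Y=4
 * -DCONVOLVED_WIDTH=55". For a 227x227 input with an 11x11 kernel, stride 4 and no padding, CONVOLVED_WIDTH
 * follows as (227 - 11) / 4 + 1 = 55.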
+ */ +__kernel void im2col11x11_padx0_pady0_nchw( + TENSOR3D_DECLARATION(src), +#if defined(NUM_GROUPS) + TENSOR3D_DECLARATION(dst), +#else // defined(NUM_GROUPS) + IMAGE_DECLARATION(dst), +#endif // defined(NUM_GROUPS) + uint src_stride_w, + uint dst_stride_w) +{ + const int xc = get_global_id(0); // x coordinate in the convolved tensor + const int yc = get_global_id(1); // y coordinate in the convolved tensor + const int ch = get_global_id(2) % SRC_DEPTH; // input feature map + const int batch = get_global_id(2) / SRC_DEPTH; // batch size + + // Calculate input indices + const int xi = xc * STRIDE_X; + const int yi = yc * STRIDE_Y; + + // Calculate output indices +#if defined(NUM_GROUPS) + const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * 121; // 11x11 + const int zo = ch / (SRC_DEPTH / NUM_GROUPS); +#else // defined(NUM_GROUPS) + const int xo = ch * 121; // 11x11 +#endif // defined(NUM_GROUPS) + const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution + + // Get input and output address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + xi * src_stride_x + yi * src_stride_y + ch * src_stride_z + batch * src_stride_w; +#if defined(NUM_GROUPS) + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w; +#else // defined(NUM_GROUPS) + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + xo * dst_stride_x + yo * dst_stride_y + batch * dst_stride_w; +#endif // defined(NUM_GROUPS) + + { + VEC_DATA_TYPE(DATA_TYPE, 8) + row00 = vload8(0, (__global DATA_TYPE *)(input_ptr)); + VEC_DATA_TYPE(DATA_TYPE, 3) + row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8); + + vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr); + vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8); + + input_ptr += src_stride_y; + output_ptr += 11 * src_stride_x; + } + + { + VEC_DATA_TYPE(DATA_TYPE, 8) + row00 = vload8(0, (__global DATA_TYPE *)(input_ptr)); + VEC_DATA_TYPE(DATA_TYPE, 3) + row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8); + + vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr); + vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8); + + input_ptr += src_stride_y; + output_ptr += 11 * src_stride_x; + } + + { + VEC_DATA_TYPE(DATA_TYPE, 8) + row00 = vload8(0, (__global DATA_TYPE *)(input_ptr)); + VEC_DATA_TYPE(DATA_TYPE, 3) + row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8); + + vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr); + vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8); + + input_ptr += src_stride_y; + output_ptr += 11 * src_stride_x; + } + + { + VEC_DATA_TYPE(DATA_TYPE, 8) + row00 = vload8(0, (__global DATA_TYPE *)(input_ptr)); + VEC_DATA_TYPE(DATA_TYPE, 3) + row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8); + + vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr); + vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8); + + input_ptr += src_stride_y; + output_ptr += 11 * src_stride_x; + } + + { + VEC_DATA_TYPE(DATA_TYPE, 8) + row00 = vload8(0, (__global DATA_TYPE *)(input_ptr)); + VEC_DATA_TYPE(DATA_TYPE, 3) + row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8); + + 
vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr); + vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8); + + input_ptr += src_stride_y; + output_ptr += 11 * src_stride_x; + } + + { + VEC_DATA_TYPE(DATA_TYPE, 8) + row00 = vload8(0, (__global DATA_TYPE *)(input_ptr)); + VEC_DATA_TYPE(DATA_TYPE, 3) + row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8); + + vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr); + vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8); + + input_ptr += src_stride_y; + output_ptr += 11 * src_stride_x; + } + + { + VEC_DATA_TYPE(DATA_TYPE, 8) + row00 = vload8(0, (__global DATA_TYPE *)(input_ptr)); + VEC_DATA_TYPE(DATA_TYPE, 3) + row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8); + + vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr); + vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8); + + input_ptr += src_stride_y; + output_ptr += 11 * src_stride_x; + } + + { + VEC_DATA_TYPE(DATA_TYPE, 8) + row00 = vload8(0, (__global DATA_TYPE *)(input_ptr)); + VEC_DATA_TYPE(DATA_TYPE, 3) + row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8); + + vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr); + vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8); + + input_ptr += src_stride_y; + output_ptr += 11 * src_stride_x; + } + + { + VEC_DATA_TYPE(DATA_TYPE, 8) + row00 = vload8(0, (__global DATA_TYPE *)(input_ptr)); + VEC_DATA_TYPE(DATA_TYPE, 3) + row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8); + + vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr); + vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8); + + input_ptr += src_stride_y; + output_ptr += 11 * src_stride_x; + } + + { + VEC_DATA_TYPE(DATA_TYPE, 8) + row00 = vload8(0, (__global DATA_TYPE *)(input_ptr)); + VEC_DATA_TYPE(DATA_TYPE, 3) + row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8); + + vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr); + vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8); + + input_ptr += src_stride_y; + output_ptr += 11 * src_stride_x; + } + + { + VEC_DATA_TYPE(DATA_TYPE, 8) + row00 = vload8(0, (__global DATA_TYPE *)(input_ptr)); + VEC_DATA_TYPE(DATA_TYPE, 3) + row01 = vload3(0, (__global DATA_TYPE *)(input_ptr) + 8); + + vstore8((VEC_DATA_TYPE(DATA_TYPE, 8))(row00.s01234567), 0, (__global DATA_TYPE *)output_ptr); + vstore3((VEC_DATA_TYPE(DATA_TYPE, 3))(row01.s012), 0, (__global DATA_TYPE *)output_ptr + 8); + + output_ptr += 11 * src_stride_x; + } + +#ifdef HAS_BIAS +#if defined(NUM_GROUPS) + if((xo / 121) == (SRC_DEPTH / NUM_GROUPS - 1)) +#else // defined(NUM_GROUPS) + if(ch == (SRC_DEPTH - 1)) +#endif // defined(NUM_GROUPS) + { + *((__global DATA_TYPE *)output_ptr) = 1.0f; + } +#endif // HAS_BIAS +} +#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(SRC_DEPTH) + +#if defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE) +/** This opencl kernel performs im2col when 
the kernel size is greater than 1x1, there is no padding and the data layout is NCHW
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float.
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=4.
+ * @note The width modulo vector size must be passed at compile time using -DWIDTH_MOD_VECTOR_SIZE e.g. -DWIDTH_MOD_VECTOR_SIZE=3.
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ * @note In case grouping is performed, the number of groups must be passed at compile time using -DNUM_GROUPS: e.g. -DNUM_GROUPS=4
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
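 *
 * An illustrative set of build options for this kernel (the concrete values below are example assumptions,
 * not taken from the library): "-DDATA_TYPE=float -DELEMENT_SIZE=4 -DSRC_WIDTH=64 -DSRC_HEIGHT=64
 * -DSRC_DEPTH=16 -DKERNEL_WIDTH=7 -DKERNEL_HEIGHT=7 -DSTRIDE_X=1 -DSTRIDE_Y=1 -DCONVOLVED_WIDTH=58
 * -DVECTOR_SIZE=4 -DWIDTH_MOD_VECTOR_SIZE=3". Here CONVOLVED_WIDTH follows as (64 - 7) / 1 + 1 = 58
 * and WIDTH_MOD_VECTOR_SIZE as KERNEL_WIDTH % VECTOR_SIZE = 7 % 4 = 3.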
+ */ +__kernel void im2col_generic_padx0_pady0_nchw( + TENSOR3D_DECLARATION(src), +#if defined(NUM_GROUPS) + TENSOR3D_DECLARATION(dst), +#else // defined(NUM_GROUPS) + IMAGE_DECLARATION(dst), +#endif // defined(NUM_GROUPS) + uint src_stride_w, + uint dst_stride_w) +{ + const int xc = get_global_id(0); // x coordinate in the convolved tensor + const int yc = get_global_id(1); // y coordinate in the convolved tensor + const int ch = get_global_id(2) % SRC_DEPTH; // input feature map + const int batch = get_global_id(2) / SRC_DEPTH; // batch size + + // Calculate input indices + const int xi = xc * STRIDE_X; + const int yi = yc * STRIDE_Y; + + // Calculate output indices +#if defined(NUM_GROUPS) + const int xo = (ch % (SRC_DEPTH / NUM_GROUPS)) * KERNEL_WIDTH * KERNEL_HEIGHT; + const int zo = ch / (SRC_DEPTH / NUM_GROUPS); +#else // defined(NUM_GROUPS) + const int xo = ch * KERNEL_WIDTH * KERNEL_HEIGHT; +#endif // defined(NUM_GROUPS) + const int yo = xc + yc * CONVOLVED_WIDTH; // Index of the convolution + + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * src_stride_z + batch * src_stride_w; +#if defined(NUM_GROUPS) + __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + zo * dst_stride_z + batch * dst_stride_w)) + xo; +#else // defined(NUM_GROUPS) + __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo; +#endif // defined(NUM_GROUPS) + + // Linearize convolution elements + for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y) + { + int last_x = 0; + for(int x = xi, x_e = xi + KERNEL_WIDTH; x + VECTOR_SIZE <= x_e; x += VECTOR_SIZE, output_ptr += VECTOR_SIZE) + { + VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) + row = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y)); + VSTORE(VECTOR_SIZE) + (row, 0, output_ptr); + last_x = x; + } + // Copy the remainder of the row by doing VLOAD(WIDTH_MOD_VECTOR_SIZE) and VSTORE(WIDTH_MOD_VECTOR_SIZE). + // Note that x and output_ptr have already been incremented by VECTOR_SIZE by the loop just before exit. 
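+        // e.g. (illustrative): with KERNEL_WIDTH=7 and VECTOR_SIZE=4 the loop above copies one 4-element
+        // chunk per row, and WIDTH_MOD_VECTOR_SIZE = 7 % 4 = 3 scalar elements remain to be copied here.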
+#if WIDTH_MOD_VECTOR_SIZE == 1
+        *output_ptr = *((__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));
+#elif WIDTH_MOD_VECTOR_SIZE > 1
+        VEC_DATA_TYPE(DATA_TYPE, WIDTH_MOD_VECTOR_SIZE)
+        row = VLOAD(WIDTH_MOD_VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + (last_x + VECTOR_SIZE) * src_stride_x + y * src_stride_y));
+        VSTORE(WIDTH_MOD_VECTOR_SIZE)
+        (row, 0, output_ptr);
+#endif /* WIDTH_MOD_VECTOR_SIZE */
+        output_ptr += WIDTH_MOD_VECTOR_SIZE;
+    } /* End of loop over KERNEL_HEIGHT */
+
+#ifdef HAS_BIAS
+#if defined(NUM_GROUPS)
+    if((xo / (KERNEL_WIDTH * KERNEL_HEIGHT)) == (SRC_DEPTH / NUM_GROUPS - 1))
+#else // defined(NUM_GROUPS)
+    if(ch == (SRC_DEPTH - 1))
+#endif // defined(NUM_GROUPS)
+    {
+        *output_ptr = 1.0f;
+    }
+#endif // HAS_BIAS
+}
+#endif // defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
+#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/normalization_layer.cl b/src/core/CL/cl_kernels/nchw/normalization_layer.cl
new file mode 100644
index 0000000000..0fef98e295
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/normalization_layer.cl
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#define MUL_OP(x, y) ((x) * (y))
+#define ADD_OP(x, y) ((x) + (y))
+#define DIV_OP(x, y) ((x) / (y))
+#define POW_OP(x, y) pow((x), (y))
+#define SQCVT_SAT(a) (a)
+
+#if defined(NUM_SLICES)
+/** Apply cross-map normalization.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
+ * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g.
-DNUM_SLICES=192
+ * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DBETA and -DKAPPA
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void normalization_layer_cross_map_nchw(TENSOR3D_DECLARATION(input),
+                                                 TENSOR3D_DECLARATION(output))
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    acc = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0;
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    coeff_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(COEFF);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    beta_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(BETA);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA);
+
+    const int current_slice = get_global_id(2);
+    const int left_slice    = max(-(int)RADIUS, -current_slice);
+    const int right_slice   = min((int)RADIUS, (int)NUM_SLICES - 1 - current_slice);
+
+    for(int i = left_slice; i <= right_slice; i++)
+    {
+        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+        values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, 0, i));
+        acc    = ADD_OP(acc, MUL_OP(values, values));
+    }
+
+    acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    normalized = POW_OP(acc, beta_v);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    normalized_pixel = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), normalized);
+
+    VSTORE(VEC_SIZE)
+    (normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif /* defined(NUM_SLICES) */
+
+#if defined(WIDTH_SIZE)
+/** Apply in-map normalization when tensors are in the NCHW data layout format.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g.
-DVEC_SIZE=16
+ * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
+ * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DBETA and -DKAPPA
+ * @note The leftover size in the X dimension should be given as a preprocessor argument using -DVEC_SIZE_LEFTOVER, i.e. x_dimension % VEC_SIZE. e.g. -DVEC_SIZE_LEFTOVER=1
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void normalization_layer_in_map_nchw(TENSOR3D_DECLARATION(input),
+                                              TENSOR3D_DECLARATION(output))
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    acc = 0;
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    coeff_v = SQCVT_SAT(COEFF);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    beta_v = SQCVT_SAT(BETA);
+    const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    kappa_v = SQCVT_SAT(KAPPA);
+
+    const int current_col = get_global_id(0) << 2;
+    const int left_pos    = max(-(int)RADIUS, -3 - current_col);
+    const int right_pos   = min((int)RADIUS, (int)WIDTH_SIZE - 1 - current_col);
+
+#if defined(IN_MAP_2D)
+    const int current_row = get_global_id(1);
+    const int first_row   = max(-(int)RADIUS, -current_row);
+    const int last_row    = min((int)RADIUS, (int)get_global_size(1) - 1 - current_row);
+#endif /* defined(IN_MAP_2D) */
+
+#if defined(IN_MAP_2D)
+    for(int j = first_row; j <= last_row; ++j)
+    {
+#endif /* defined(IN_MAP_2D) */
+        for(int i = left_pos; i <= right_pos; ++i)
+        {
+#if defined(IN_MAP_2D)
+            VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+            values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, j, 0));
+#else /* defined(IN_MAP_2D) */
+            VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+            values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, 0, 0));
+#endif /* defined(IN_MAP_2D) */
+            acc = ADD_OP(acc, MUL_OP(values, values));
+        }
+#if defined(IN_MAP_2D)
+    }
+#endif /* defined(IN_MAP_2D) */
+
+    acc =
ADD_OP(MUL_OP(acc, coeff_v), kappa_v); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + normalized = POW_OP(acc, beta_v); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + normalized_pixel = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), normalized); + + VSTORE(VEC_SIZE) + (normalized_pixel, 0, (__global DATA_TYPE *)out.ptr); +} +#endif // defined(WIDTH_SIZE) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl new file mode 100644 index 0000000000..23a0de76f7 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(VEC_SIZE) + +#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + +/** Apply normalize_planar_yuv layer on tensors with NCHW data layout. + * + * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8 + * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8 + * + * @param[in] src_ptr Pointer to the first source tensor. Supported data types: F16/F32 + * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr + * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) + * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor + * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr + * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes) + * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] std_offset_first_element_in_bytes The offset of the first element in the std tensor + */ +__kernel void normalize_planar_yuv_layer_nchw(TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + VECTOR_DECLARATION(mean), + VECTOR_DECLARATION(std)) +{ + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + Vector mean = CONVERT_TO_VECTOR_STRUCT(mean); + Vector std = CONVERT_TO_VECTOR_STRUCT(std); + + const uint current_slice = get_global_id(2) % NUM_CHANNELS; + + const DATA_TYPE curr_mean = *((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE))); + const DATA_TYPE curr_std = *((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE))); + + TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr); + TYPE res = (data - curr_mean) / curr_std; + + VSTORE(VEC_SIZE) + (res, 0, (__global DATA_TYPE *)dst.ptr); +}
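+ +/* Worked example (illustrative): with -DNUM_CHANNELS=3, the work-item at get_global_id(2) == 4 + * normalizes channel 4 % 3 == 1, so with mean[1] = 0.5f and std[1] = 2.f an input value of 1.5f + * is stored as (1.5f - 0.5f) / 2.f == 0.5f. */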
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl new file mode 100644 index 0000000000..0f02ef6184 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE) + +#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) +#define OFFSET_FLT ((float)OFFSET) +#define SCALE_FLT ((float)SCALE) + +#if defined(NUM_CHANNELS) + +/** Apply normalize_planar_yuv layer on tensors with NCHW data layout. + * + * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=uchar + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8 + * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8 + * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8 + * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8 + * + * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED + * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr + * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) + * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor + * @param[in] std_ptr Pointer to the std tensor.
Supported data types: same as @p src_ptr + * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes) + * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] std_offset_first_element_in_bytes The offset of the first element in the std tensor + */ +__kernel void normalize_planar_yuv_layer_q8_nchw(TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + VECTOR_DECLARATION(mean), + VECTOR_DECLARATION(std)) +{ + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + Vector mean = CONVERT_TO_VECTOR_STRUCT(mean); + Vector std = CONVERT_TO_VECTOR_STRUCT(std); + + const uint current_slice = get_global_id(2) % NUM_CHANNELS; + + VEC_DATA_TYPE(float, VEC_SIZE) + curr_mean_flt = (VEC_DATA_TYPE(float, VEC_SIZE))(*((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE)))); + curr_mean_flt = round(curr_mean_flt - OFFSET_FLT) * SCALE_FLT; + + VEC_DATA_TYPE(float, VEC_SIZE) + curr_std_flt = (VEC_DATA_TYPE(float, VEC_SIZE))(*((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE)))); + curr_std_flt = round(curr_std_flt - OFFSET_FLT) * SCALE_FLT; + + VEC_DATA_TYPE(float, VEC_SIZE) + data_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr), VEC_DATA_TYPE(float, VEC_SIZE)); + data_flt = round(data_flt - OFFSET_FLT) * SCALE_FLT; + + // Perform normalization + VEC_DATA_TYPE(float, VEC_SIZE) + res_flt = (data_flt - curr_mean_flt) / curr_std_flt; + + const TYPE res_u8 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE); + VSTORE(VEC_SIZE) + (res_u8, 0, (__global DATA_TYPE *)dst.ptr); +}
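+ +/* Worked example (illustrative): with -DOFFSET=128 and -DSCALE=0.5f, an input value of 130 is + * dequantized to (130 - 128) * 0.5f == 1.f, normalized in float as (1.f - mean) / std, and the + * result is requantized as round(res / 0.5f) + 128 before the saturating store above. */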
+ +#endif // defined(NUM_CHANNELS) +#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/pooling_layer.cl b/src/core/CL/cl_kernels/nchw/pooling_layer.cl new file mode 100644 index 0000000000..790ddb381a --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/pooling_layer.cl @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" +#include "repeat.h" +#include "tile_helpers.h" + +#if defined(POOL_AVG) || defined(POOL_L2) +#define POOL_OP(x, y) ((x) + (y)) +#else /* defined(POOL_AVG) || defined(POOL_L2) */ +#define POOL_OP(x, y) (fmax((x), (y))) +#endif /* defined(POOL_AVG) || defined(POOL_L2) */ + +#if defined(POOL_L2) +#define POW2_OP(x, vec_size) ((x) * (x)) +#else /* defined(POOL_L2) */ +#define POW2_OP(x, vec_size) (x) +#endif /* defined(POOL_L2) */ + +#define DIV_OP(x, y) (x * (1.f / y)) +#define SQRT_OP(x) sqrt((x)) + +#if defined(FP_MIXED_PRECISION) +#define CONVERT_TO_ACC_DATA_TYPE(x, n) CONVERT(x, VEC_DATA_TYPE(ACC_DATA_TYPE, n)) +#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) \ + CONVERT_TO_ACC_DATA_TYPE(vload##n(offset, ptr), n) +#else /* defined(FP_MIXED_PRECISION) */ +#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) vload##n(offset, ptr) +#endif /* defined(FP_MIXED_PRECISION) */ + +ACC_DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h, + const int pad_x, const int pad_y, const int stride_x, const int stride_y) +{ + int start_x = get_global_id(0) * stride_x - pad_x; + int start_y = get_global_id(1) * stride_y - pad_y; + const int end_x = min(start_x + pool_size_x, upper_bound_w); + const int end_y = min(start_y + pool_size_y, upper_bound_h); +#if defined(EXCLUDE_PADDING) + start_x = max(0, start_x); + start_y = max(0, start_y); +#endif /* defined(EXCLUDE_PADDING) */ + return ((end_y - start_y) * (end_x - start_x)); +} + +#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) + +/** Performs a pooling function of pool size equal to N (NCHW) + * + * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32; + * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; + * @note In case of average pooling the following information must be passed at compile time: + * -DPOOL_AVG must be provided otherwise max pooling will be performed. + * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad) + * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension + * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor.
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void pooling_layer_MxN_nchw( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) + vdata = INITIAL_VALUE; + ACC_DATA_TYPE sdata = INITIAL_VALUE; + + // Load data + for(int y = 0; y < POOL_SIZE_Y; y++) + { + int x = 0; + for(; x <= ((int)POOL_SIZE_X - 8); x += 8) + { + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) + data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)); +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data0 *= data0; +#endif /* defined(POOL_L2) */ + vdata = POOL_OP(vdata, data0); + } + + // Leftover + for(; x < (int)POOL_SIZE_X; ++x) + { + ACC_DATA_TYPE data0 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0))); +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data0 *= data0; +#endif /* defined(POOL_L2) */ + sdata = POOL_OP(sdata, data0); + } + } + + // Reduce result + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) + reduce4 = POOL_OP(vdata.s0123, vdata.s4567); + VEC_DATA_TYPE(ACC_DATA_TYPE, 2) + reduce2 = POOL_OP(reduce4.s01, reduce4.s23); + ACC_DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1); + res = POOL_OP(res, sdata); + +#if defined(POOL_AVG) || defined(POOL_L2) + // Divide by pool region in case of average pooling + res = DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)); +#endif /* defined(POOL_AVG) || defined(POOL_L2) */ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res = SQRT_OP(res); +#endif /* defined(POOL_L2) */ + + // Store result + *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res; +}
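+ +/* Host-side sketch (illustrative): a 3x3 average pooling with stride 2 and no padding could be + * built with options along these lines; the sizes below are example values only, assuming program + * is a valid cl_program and device a cl_device_id: + * + * const char *opts = "-DDATA_TYPE=float -DACC_DATA_TYPE=float -DPOOL_AVG -DPOOL_SIZE_X=3 " + * "-DPOOL_SIZE_Y=3 -DSTRIDE_X=2 -DSTRIDE_Y=2 -DPAD_X=0 -DPAD_Y=0 " + * "-DMAX_WIDTH=112 -DMAX_HEIGHT=112 -DINITIAL_VALUE=0"; + * clBuildProgram(program, 1, &device, opts, NULL, NULL); + */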
+#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) + +#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) + +inline void offset_no_padding_nchw(const Tensor3D *input, uint *offset_top, uint *offset_bottom) +{ + const int pad_horiz = PAD_TENSOR_LEFT + PAD_TENSOR_RIGHT; + const int pad_vert = PAD_TENSOR_TOP + PAD_TENSOR_BOTTOM; + + const int x = get_global_id(0) * STRIDE_X; + const int y = get_global_id(1) * STRIDE_Y; + const int z = get_global_id(2); + + //x axis: width, y axis: height, z axis: component + const uint padded_offset = input->offset_first_element_in_bytes + + x * input->stride_x + + y * input->stride_y + + z * input->stride_z; + + const uint offset_base = padded_offset + - y * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each row */ + - PAD_TENSOR_TOP * input->stride_y /* top padding */ + - z * MAX_HEIGHT * pad_horiz * sizeof(DATA_TYPE) - z * pad_vert * input->stride_y /* Z plane padding */ + - PAD_TENSOR_LEFT * sizeof(DATA_TYPE); + +#if defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) + *offset_top = (uint)((offset_base / sizeof(DATA_TYPE)) % (TENSOR_CHANNEL * TENSOR_WIDTH * TENSOR_HEIGHT)); +#else /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */ + *offset_top = (uint)(offset_base / sizeof(DATA_TYPE)); +#endif /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */ + + *offset_bottom = *offset_top + input->stride_y / sizeof(DATA_TYPE) - pad_horiz; + + return; +} + +#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) + +/** Performs a MAX pooling of pool size equal to 2, and records max value indices for NCHW. + * + * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F32 + * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; + * @note Tensor width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT + * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] indices_ptr Pointer to the indices tensor.
Supported data types: U32 + * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes) + * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes) + * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes) + * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor + */ +__kernel void pooling_layer_2_nchw_indices_fp32( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output), + TENSOR3D_DECLARATION(indices)) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices); + + // Load data + float2 data0 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 0, 0)); + float2 data1 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 1, 0)); + + // Perform calculations + float data0_max = POOL_OP(data0.s0, data0.s1); + float data1_max = POOL_OP(data1.s0, data1.s1); + float res = POOL_OP(data0_max, data1_max); + // Store result + *(__global float *)output.ptr = res; + +#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) + + uint offset_top = 0; + uint offset_bottom = 0; + + offset_no_padding_nchw(&input, &offset_top, &offset_bottom); + + uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1)); + uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1)); + uint index = select(index1, index0, isgreaterequal(data0_max, data1_max)); + + *(__global uint *)indices.ptr = index; + +#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) +}
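+ +/* Note (illustrative): the stored index is the flattened offset of the winning element in the + * unpadded input. E.g. for data0 = (3.f, 7.f) and data1 = (5.f, 2.f) the maximum 7.f lies on the + * top row, second column, so index == offset_top + 1. */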
+ +/** Performs a MAX pooling of pool size equal to 2, and records max value indices for NCHW. + * + * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F16 + * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; + * @note Tensor width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT + * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32 + * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes) + * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes) + * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes) + * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor + */ +__kernel void pooling_layer_2_nchw_indices_fp16( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output), + TENSOR3D_DECLARATION(indices)) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices); + + // Load data + half2 data0 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 0, 0)); + half2 data1 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 1, 0)); + + // Perform calculations + half data0_max = POOL_OP(data0.s0, data0.s1); + half data1_max = POOL_OP(data1.s0, data1.s1); + half res = POOL_OP(data0_max, data1_max); + // Store result + *(__global half *)output.ptr = res; + +#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) + + uint offset_top = 0; + uint offset_bottom = 0; + + offset_no_padding_nchw(&input, &offset_top, &offset_bottom); + + uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1)); + uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1)); + uint index = select(index1, index0, isgreaterequal(data0_max, data1_max)); + + *(__global uint *)indices.ptr =
index; + +#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) +} \ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/nchw/pooling_layer_quantized.cl new file mode 100644 index 0000000000..1440ef3ed1 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/pooling_layer_quantized.cl @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(INITIAL_VALUE) +#define VEC_TYPE(VEC_SIZE) VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + +#if defined(POOL_AVG) +#define POOL_OP(x, y) ((x) + (y)) +#else /* defined(POOL_AVG) */ +#define POOL_OP(x, y) (max((x), (y))) +#endif /* defined(POOL_AVG) */ + +#define DIV_OP(x, y) (x * (1.f / y)) + +#if defined(POOL_L2) +#error "L2 pooling is not supported" +#endif /* defined(POOL_L2) */ + +int calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h, + const int pad_x, const int pad_y, const int stride_x, const int stride_y) +{ + int start_x = get_global_id(0) * stride_x - pad_x; + int start_y = get_global_id(1) * stride_y - pad_y; + const int end_x = min(start_x + pool_size_x, upper_bound_w); + const int end_y = min(start_y + pool_size_y, upper_bound_h); +#if defined(EXCLUDE_PADDING) + start_x = max(0, start_x); + start_y = max(0, start_y); +#endif /* defined(EXCLUDE_PADDING) */ + return ((end_y - start_y) * (end_x - start_x)); +} + +/** Performs a pooling function of pool size equal to N (NCHW) + * + * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; + * @note In case of average pooling the following information must be passed at compile time: + * -DPOOL_AVG must be provided otherwise max pooling will be performed. + * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad) + * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension + * @note Input data type must be passed at compile time using -DDATA_TYPE=type, e.g. -DDATA_TYPE=uchar + * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g.
-DINITIAL_VALUE=0 + * + * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void pooling_layer_MxN_quantized_nchw( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + int8 vdata = INITIAL_VALUE; + int sdata = INITIAL_VALUE; + + // Load data + for(int y = 0; y < POOL_SIZE_Y; y++) + { + int x = 0; + for(; x <= ((int)POOL_SIZE_X - 8); x += 8) + { + VEC_TYPE(8) + data = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)); + int8 data0 = convert_int8(data); + vdata = POOL_OP(vdata, data0); + } + + // Leftover + for(; x < (int)POOL_SIZE_X; ++x) + { + DATA_TYPE data = *((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)); + int data0 = convert_int(data); + sdata = POOL_OP(sdata, data0); + } + } + + // Reduce result + int4 reduce4 = POOL_OP(vdata.s0123, vdata.s4567); + int2 reduce2 = POOL_OP(reduce4.s01, reduce4.s23); + int res = POOL_OP(reduce2.s0, reduce2.s1); + res = POOL_OP(res, sdata); + +#if defined(POOL_AVG) + res = round(DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y))); +#endif /* defined(POOL_AVG) */ + + DATA_TYPE result_q8 = CONVERT(res, DATA_TYPE); + +#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) + + const float result_f32 = convert_float(result_q8); + const float input_offset = (float)OFFSET_IN1; + const float input_scale = (float)SCALE_IN1; + const float scale_out = (float)SCALE_OUT; + const float offset_out = (float)OFFSET_OUT; + const float in_f32 = (result_f32 - input_offset) * input_scale; + const float out_f32 = in_f32 / scale_out + offset_out; + result_q8 = CONVERT_SAT(convert_int_rte(out_f32), DATA_TYPE); + +#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ + + *(__global DATA_TYPE *)output.ptr = result_q8; +}
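+ +/* Worked example (illustrative): with OFFSET_IN1=5, SCALE_IN1=0.1f, OFFSET_OUT=3 and + * SCALE_OUT=0.2f, a pooled value of 25 is dequantized to (25 - 5) * 0.1f == 2.f and requantized + * to 2.f / 0.2f + 3 == 13 before the saturating conversion above. */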
+#endif // defined(DATA_TYPE) && defined(INITIAL_VALUE) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/prior_box_layer.cl b/src/core/CL/cl_kernels/nchw/prior_box_layer.cl new file mode 100644 index 0000000000..7524ba7b4a --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/prior_box_layer.cl @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(LAYER_WIDTH) && defined(LAYER_HEIGHT) && defined(OFFSET) && defined(STEP_X) && defined(STEP_Y) && defined(NUM_PRIORS) && defined(VARIANCE_0) && defined(VARIANCE_1) && defined(VARIANCE_2) && defined(VARIANCE_3) + +/** Compute prior boxes and clip (NCHW) + * + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F32 + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] idx Index to write to + * @param[in] center_x Center value of the x axis + * @param[in] center_y Center value of the y axis + * @param[in] box_width Prior box width + * @param[in] box_height Prior box height + * + */ +inline void calculate_xy_min_max_nchw(Image *out, int idx, float center_x, float center_y, float box_width, float box_height) +{ + float xmin = (center_x - box_width / 2.f) / WIDTH; + float ymin = (center_y - box_height / 2.f) / HEIGHT; + float xmax = (center_x + box_width / 2.f) / WIDTH; + float ymax = (center_y + box_height / 2.f) / HEIGHT; + +#if defined(CLIP) + xmin = clamp(xmin, 0.f, 1.f); + ymin = clamp(ymin, 0.f, 1.f); + xmax = clamp(xmax, 0.f, 1.f); + ymax = clamp(ymax, 0.f, 1.f); +#endif // defined(CLIP) + + // Store result + vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(xmin, ymin, xmax, ymax), 0, ((__global DATA_TYPE *)offset(out, idx + 0, 0))); +} + +/** Compute prior boxes (NCHW) + * + * @param[out] output_ptr Pointer to the destination tensor.
Supported data types: F32 + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] min_size Prior box min size + * @param[in] min_idx Index of the min vector + * @param[in] idx Index to write to + * + * @return The updated index + */ +inline int calculate_min_nchw(Image *out, __global float *max, __global float *aspect_ratios, int max_size, int aspect_ratios_size, float min_size, int min_idx, int idx) +{ + const float center_x = ((float)(get_global_id(0) % LAYER_WIDTH) + OFFSET) * STEP_X; + const float center_y = ((float)(get_global_id(0) / LAYER_WIDTH) + OFFSET) * STEP_Y; + + float box_width = min_size; + float box_height = min_size; + calculate_xy_min_max_nchw(out, idx, center_x, center_y, box_width, box_height); + idx += 4; + + if(max_size > 0) + { + box_width = sqrt(min_size * max[min_idx]); + box_height = box_width; + calculate_xy_min_max_nchw(out, idx, center_x, center_y, box_width, box_height); + idx += 4; + } + for(unsigned int i = 0; i < aspect_ratios_size; ++i) + { + if(fabs(aspect_ratios[i] - 1.f) < 1e-6f) + { + continue; + } + box_width = min_size * sqrt(aspect_ratios[i]); + box_height = min_size * rsqrt(aspect_ratios[i]); + + calculate_xy_min_max_nchw(out, idx, center_x, center_y, box_width, box_height); + idx += 4; + } + + return idx; +} +/** Calculate prior boxes with NCHW format. + * + * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: F32 + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] min The minimum values + * @param[in] max The maximum values + * @param[in] aspect_ratios The aspect ratio values + * @param[in] min_size The minimum values size + * @param[in] max_size The maximum values size + * @param[in] aspect_ratios_size The aspect ratio values size + */ +__kernel void prior_box_layer_nchw(IMAGE_DECLARATION(output), __global float *min, __global float *max, __global float *aspect_ratios, unsigned int min_size, unsigned int max_size, + unsigned int aspect_ratios_size) +{ + Image out = CONVERT_TO_IMAGE_STRUCT(output); + + int idx = 0; + for(unsigned int i = 0; i < min_size; ++i) + { + idx = calculate_min_nchw(&out, max, aspect_ratios, max_size, aspect_ratios_size, min[i], i, idx); + } + + // Store variances + for(int i = 0; i < (NUM_PRIORS * 4); i += 4) + { + vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(VARIANCE_0, VARIANCE_1, VARIANCE_2, VARIANCE_3), 0, ((__global DATA_TYPE *)offset(&out, i, 1))); + } +}
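+ +/* Note (illustrative): each work-item emits NUM_PRIORS boxes of four normalized corners + * (xmin, ymin, xmax, ymax) on row 0 of the output and the matching per-box variances + * (VARIANCE_0..VARIANCE_3) on row 1, so each row holds NUM_PRIORS * 4 values per location. */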
+#endif /* defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(LAYER_WIDTH) && defined(LAYER_HEIGHT) && defined(OFFSET) && defined(STEP_X) && defined(STEP_Y) && defined(NUM_PRIORS) && defined(VARIANCE_0) && defined(VARIANCE_1) && defined(VARIANCE_2) && defined(VARIANCE_3) */ diff --git a/src/core/CL/cl_kernels/nchw/remap.cl b/src/core/CL/cl_kernels/nchw/remap.cl new file mode 100644 index 0000000000..fab88a1682 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/remap.cl @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" +#include "warp_helpers.h" + +#ifndef DEPTH_OUT +/** Performs a remapping of an input image to an output given two remapping images using nearest-neighbour interpolation. + * + * This kernel performs remapping with this method of pixel coordinate translation: + * out(x,y) = in(mapx(x,y), mapy(x,y)); + * + * @param[in] in_ptr Pointer to the source image. Supported data types: U8. + * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: U8. + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image + * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32. + * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes) + * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes) + * @param[in] mapx_step_y mapx_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image + * @param[in] mapy_ptr Pointer to the y remapping image. Supported data types: F32. + * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes) + * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes) + * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image + * @param[in] width Width of the input image + * @param[in] height Height of the input image + */ +__kernel void remap_nearest_neighbour_nchw( + IMAGE_DECLARATION(in), + IMAGE_DECLARATION(out), + IMAGE_DECLARATION(mapx), + IMAGE_DECLARATION(mapy), + const float width, + const float height) +{ + Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); + Image out = CONVERT_TO_IMAGE_STRUCT(out); + Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx); + Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy); + + float4 mapx_coords = vload4(0, (__global float *)mapx.ptr); + float4 mapy_coords = vload4(0, (__global float *)mapy.ptr); + float8 map_coords = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1, + mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3); + + vstore4(read_texels4(&in, convert_int8(clamp_to_border(map_coords, width, height))), 0, out.ptr); +}
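+ +/* Worked example (illustrative): if mapx(x, y) == 10.3f and mapy(x, y) == 4.8f, the kernel above + * clamps the pair to the borders and truncates it via convert_int8, so the output pixel is read + * from in(10, 4), i.e. out(x, y) = in(mapx(x, y), mapy(x, y)). */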
+ +/** Performs a remapping of an input image to an output given two remapping images using bilinear interpolation. + * + * This kernel performs remapping with this method of pixel coordinate translation: + * out(x,y) = in(mapx(x,y), mapy(x,y)); + * + * @param[in] in_ptr Pointer to the source image. Supported data types: U8. + * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: U8. + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image + * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32. + * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes) + * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes) + * @param[in] mapx_step_y mapx_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image + * @param[in] mapy_ptr Pointer to the y remapping image. Supported data types: F32. + * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes) + * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes) + * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image + * @param[in] width Width of the input image + * @param[in] height Height of the input image + */ +__kernel void remap_bilinear_nchw( + IMAGE_DECLARATION(in), + IMAGE_DECLARATION(out), + IMAGE_DECLARATION(mapx), + IMAGE_DECLARATION(mapy), + const float width, + const float height) +{ + Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); + Image out = CONVERT_TO_IMAGE_STRUCT(out); + Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx); + Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy); + + float4 mapx_coords = vload4(0, (__global float *)mapx.ptr); + float4 mapy_coords = vload4(0, (__global float *)mapy.ptr); + float8 map_coords = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1, + mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3); + + vstore4(bilinear_interpolate(&in, clamp_to_border(map_coords, width, height), width, height), 0, out.ptr); +} +#endif // DEPTH_OUT \ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/reorg_layer.cl b/src/core/CL/cl_kernels/nchw/reorg_layer.cl new file mode 100644 index 0000000000..f66b17c1a6 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/reorg_layer.cl @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2018-2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE) + +#define CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi) \ + ({ \ + int offset = zo / (int)SRC_DEPTH; \ + xi = xo * (int)STRIDE + offset % (int)STRIDE; \ + yi = yo * (int)STRIDE + offset / (int)STRIDE; \ + zi = zo % SRC_DEPTH; \ + }) + +/** Performs a reorganization layer of input tensor to the output tensor when the data layout is NCHW + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64 + * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: All + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void reorg_layer_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(dst); + + int xo = get_global_id(0); + int yo = get_global_id(1); + int zo = get_global_id(2); + int xi, yi, zi; + + CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi); + + int src_offset = xi * sizeof(DATA_TYPE) + yi * src_stride_y + zi * src_stride_z; + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset)); +} +#endif // defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/scale.cl b/src/core/CL/cl_kernels/nchw/scale.cl new file mode 100644 index 0000000000..63a53cc4f2 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/scale.cl @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" +#include "warp_helpers.h" + +/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates. + * + * @param[in] coord 2D coordinates to transform. + * @param[in] scale input/output scale ratio + * + * @return a float8 containing 4 2D transformed values in the input image.
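+ * + * Worked example (illustrative): with SAMPLING_POLICY_TOP_LEFT, coord = (4, 2) and scale = (0.5f, 0.5f) + * give new_x = (2.f, 2.5f, 3.f, 3.5f) and new_y = 1.f, so transform_nearest returns + * (2.f, 1.f, 2.5f, 1.f, 3.f, 1.f, 3.5f, 1.f).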
+ */
+inline const float8 transform_nearest(const float2 coord, const float2 scale)
+{
+#ifdef SAMPLING_POLICY_TOP_LEFT
+    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+    const float4 new_x = in_x_coords * (float4)(scale.s0);
+    const float4 new_y = (float4)(coord.s1 * scale.s1);
+    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#elif SAMPLING_POLICY_CENTER
+    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+    const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0);
+    const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1);
+    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
+}
+
+/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
+ *
+ * @param[in] coord 2D coordinates to transform.
+ * @param[in] scale input/output scale ratio
+ *
+ * @return a float8 containing 4 2D transformed values in the input image.
+ */
+inline const float8 transform_bilinear(const float2 coord, const float2 scale)
+{
+    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+#ifdef SAMPLING_POLICY_TOP_LEFT
+    const float4 new_x = in_x_coords * (float4)(scale.s0);
+    const float4 new_y = (float4)(coord.s1 * scale.s1);
+    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#elif SAMPLING_POLICY_CENTER
+    const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
+    const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
+    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
+}
+
+/** Performs an affine transformation on an image interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel U8 or S16.
+ *
+ * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] input_width Input image width
+ * @param[in] input_height Input image height
+ * @param[in] scale_x The scale factor along x dimension
+ * @param[in] scale_y The scale factor along y dimension
+ */
+__kernel void scale_nearest_neighbour_nchw(
+    IMAGE_DECLARATION(in),
+    IMAGE_DECLARATION(out),
+    const float input_width,
+    const float input_height,
+    const float scale_x,
+    const float scale_y)
+{
+    Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+    Image out = CONVERT_TO_IMAGE_STRUCT(out);
+    const float2 r = (float2)(scale_x, scale_y);
+    float8 transformed = transform_nearest(get_current_coords(), r);
+#ifdef ALIGN_CORNERS
+    transformed = round(transformed);
+#endif // ALIGN_CORNERS
+    const float8 tc = clamp_to_border_with_size(transformed, input_width, input_height, BORDER_SIZE);
+    vstore4(read_texels4(&in, convert_int8(tc)), 0, (__global DATA_TYPE *)out.ptr);
+}
+
+/** Performs an affine transformation on an image interpolating with the BILINEAR method.
+ *
+ * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] input_width Input image width
+ * @param[in] input_height Input image height
+ * @param[in] scale_x The scale factor along x dimension
+ * @param[in] scale_y The scale factor along y dimension
+ */
+__kernel void scale_bilinear_nchw(
+    IMAGE_DECLARATION(in),
+    IMAGE_DECLARATION(out),
+    const float input_width,
+    const float input_height,
+    const float scale_x,
+    const float scale_y)
+{
+    Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+    Image out = CONVERT_TO_IMAGE_STRUCT(out);
+    const float2 r = (float2)(scale_x, scale_y);
+    const float8 tc = transform_bilinear(get_current_coords(), r);
+    vstore4(bilinear_interpolate_with_border(&in, tc, input_width, input_height, BORDER_SIZE), 0, (__global DATA_TYPE *)out.ptr);
+}
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/scale_quantized.cl b/src/core/CL/cl_kernels/nchw/scale_quantized.cl
new file mode 100644
index 0000000000..946ad65c14
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/scale_quantized.cl
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers_asymm.h"
+#include "warp_helpers_quantized.h"
+
+/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
+ *
+ * @param[in] coord 2D coordinates to transform.
+ * @param[in] scale input/output scale ratio
+ *
+ * @return a float8 containing 4 2D transformed values in the input image.
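+ *
+ * As an illustrative sketch only (example values chosen for this note, not mandated by the
+ * kernel): with SAMPLING_POLICY_CENTER, coord = (float2)(0, 0) and scale = (float2)(2, 2),
+ * in_x_coords = (0, 1, 2, 3), new_x = (0.5f, 2.5f, 4.5f, 6.5f) and new_y = 0.5f,
+ * so the returned float8 is (0.5f, 0.5f, 2.5f, 0.5f, 4.5f, 0.5f, 6.5f, 0.5f).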
+ */
+inline const float8 transform_bilinear_quantized(const float2 coord, const float2 scale)
+{
+    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+#ifdef SAMPLING_POLICY_TOP_LEFT
+    const float4 new_x = in_x_coords * (float4)(scale.s0);
+    const float4 new_y = (float4)(coord.s1 * scale.s1);
+    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#elif SAMPLING_POLICY_CENTER
+    const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
+    const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
+    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
+}
+
+/** Performs an affine transformation on an image interpolating with the BILINEAR method.
+ *
+ * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ * @note Scale value for QASYMM8 data type to be used is passed as -DSCALE= e.g. -DSCALE=0.5
+ * @note Offset value for QASYMM8 data type to be used is passed as -DOFFSET= e.g. -DOFFSET=1
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: QASYMM8. (Must be the same as the input)
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] input_width Input image width
+ * @param[in] input_height Input image height
+ * @param[in] scale_x The scale factor along x dimension
+ * @param[in] scale_y The scale factor along y dimension
+ */
+__kernel void scale_bilinear_quantized_nchw(
+    IMAGE_DECLARATION(in),
+    IMAGE_DECLARATION(out),
+    const float input_width,
+    const float input_height,
+    const float scale_x,
+    const float scale_y)
+{
+    Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+    Image out = CONVERT_TO_IMAGE_STRUCT(out);
+    const float2 r = (float2)(scale_x, scale_y);
+    const float8 tc = transform_bilinear_quantized(get_current_coords_quantized(), r);
+    vstore4(bilinear_interpolate_with_border_quantized(&in, tc, input_width, input_height, BORDER_SIZE, SCALE, OFFSET), 0, (__global DATA_TYPE *)out.ptr);
+}
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/space_to_batch.cl b/src/core/CL/cl_kernels/nchw/space_to_batch.cl
new file mode 100644
index 0000000000..e162a29bb0
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/space_to_batch.cl
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
+/** Calculate the space to batch conversion.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The block shape tensor rank must be passed at compile time using -DBLOCK_SHAPE_DIM. e.g. -DBLOCK_SHAPE_DIM=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in] paddings_ptr Pointer to the second source image. Supported data types: S32
+ * @param[in] paddings_stride_x Stride of the paddings tensor in X dimension (in bytes)
+ * @param[in] paddings_step_x paddings_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] paddings_stride_y Stride of the paddings tensor in Y dimension (in bytes)
+ * @param[in] paddings_step_y paddings_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] paddings_offset_first_element_in_bytes The offset of the first element in the second source image
+ * @param[in] block_shape_ptr Pointer to the block shape tensor. Supported data types: S32
+ * @param[in] block_shape_stride_x Stride of the block shape tensor in X dimension (in bytes)
+ * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor
+ * @param[in] batch_id The output tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_batch_nchw(
+    TENSOR4D_DECLARATION(input),
+    IMAGE_DECLARATION(paddings),
+    VECTOR_DECLARATION(block_shape),
+    const int batch_id,
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+    Image pad = CONVERT_TO_IMAGE_STRUCT_NO_STEP(paddings);
+    Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    const int pad_left_x = *((__global int *)offset(&pad, 0, 0));
+    const int pad_right_x = *((__global int *)offset(&pad, 1, 0));
+    const int pad_left_y = *((__global int *)offset(&pad, 0, 1));
+    const int pad_right_y = *((__global int *)offset(&pad, 1, 1));
+
+    int block_x = *((__global int *)vector_offset(&block, 0));
+    int block_y = *((__global int *)vector_offset(&block, 1));
+
+    const int out_x = get_global_id(0);
+    const int out_y = get_global_id(1);
+    const int z = get_global_id(2);
+
+    const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+    const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
+    if(((pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) && (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN)))
+    {
+        const int w = batch_id % BATCH_IN;
+        const int in_x = pos_x - pad_left_x;
+        const int in_y = pos_y - pad_left_y;
+
+        *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
+    }
+}
+
+#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
+
+#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
+/** Calculate the space to batch conversion.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
+ * @note The starting pad value of x must be passed at compile time using -DPAD_LEFT_X. e.g. -DPAD_LEFT_X=2
+ * @note The ending pad value of x must be passed at compile time using -DPAD_RIGHT_X. e.g. -DPAD_RIGHT_X=2
+ * @note The starting pad value of y must be passed at compile time using -DPAD_LEFT_Y. e.g. -DPAD_LEFT_Y=2
+ * @note The ending pad value of y must be passed at compile time using -DPAD_RIGHT_Y. e.g. -DPAD_RIGHT_Y=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] batch_id The output tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_batch_static_nchw(
+    TENSOR4D_DECLARATION(input),
+    const int batch_id,
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    int block_x = BLOCK_SHAPE_X;
+    int block_y = BLOCK_SHAPE_Y;
+
+    const int out_x = get_global_id(0);
+    const int out_y = get_global_id(1);
+    const int z = get_global_id(2);
+
+    const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+    const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
+    if(pos_y >= PAD_LEFT_Y && pos_y < PAD_LEFT_Y + HEIGHT_IN && pos_x >= PAD_LEFT_X && pos_x < PAD_LEFT_X + WIDTH_IN)
+    {
+        const int w = batch_id % BATCH_IN;
+        const int in_x = pos_x - PAD_LEFT_X;
+        const int in_y = pos_y - PAD_LEFT_Y;
+
+        *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
+    }
+}
+#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
diff --git a/src/core/CL/cl_kernels/nchw/space_to_depth.cl b/src/core/CL/cl_kernels/nchw/space_to_depth.cl
new file mode 100644
index 0000000000..aea02e813b
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/space_to_depth.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+/** Space to depth transformation. (NCHW)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor channel size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
+ * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[in] batch_id The input tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void space_to_depth_nchw(
+    TENSOR4D_DECLARATION(input),
+    const int batch_id,
+    TENSOR3D_DECLARATION(output))
+{
+    Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+    const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+    const int z = get_global_id(2) % r;
+
+    const int in_x = x * BLOCK_SHAPE + (get_global_id(2) / r) % BLOCK_SHAPE;
+    const int in_y = y * BLOCK_SHAPE + (get_global_id(2) / r) / BLOCK_SHAPE;
+
+    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, batch_id));
+}
+#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/upsample_layer.cl b/src/core/CL/cl_kernels/nchw/upsample_layer.cl
new file mode 100644
index 0000000000..723c491165
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/upsample_layer.cl
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function applies upsampling to an input image. (NCHW)
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE = Tensor data type. Supported data types: All
+ * -# -DVEC_SIZE_IN = Input vector size
+ * -# -DVEC_SIZE_OUT = Output vector size
+ * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to set this, might need to step back a bit)
+ * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to set this, might need to step back a bit)
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: All
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void upsample_layer_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#if defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi_in = (int)(get_global_id(0) * VEC_SIZE_IN);
+    const int xi_out = (int)(get_global_id(0) * VEC_SIZE_OUT);
+    src.ptr -= max(xi_in - (int)LAST_ACCESSED_X_IN, 0) * src_stride_x;
+    dst.ptr -= max(xi_out - (int)LAST_ACCESSED_X_OUT, 0) * dst_stride_x;
+
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    data = vload8(0, (__global DATA_TYPE *)src.ptr);
+
+    VEC_DATA_TYPE(DATA_TYPE, 16)
+    data_out = (VEC_DATA_TYPE(DATA_TYPE, 16))(data.s0, data.s0, data.s1, data.s1, data.s2, data.s2, data.s3, data.s3, data.s4, data.s4, data.s5, data.s5, data.s6, data.s6, data.s7, data.s7);
+
+    vstore16(data_out, 0, (__global DATA_TYPE *)dst.ptr);
+    vstore16(data_out, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0));
+#else // !(defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT))
+    *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0)) = *((__global DATA_TYPE *)src.ptr);
+    *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0)) = *((__global DATA_TYPE *)src.ptr);
+#endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+}
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl b/src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl
new file mode 100644
index 0000000000..85eff9e6d9
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl
@@ -0,0 +1,911 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(SRC_DIM_Z)
+/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 2x2/2x1/1x2
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_2x2_3x3_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+    // Load the values from the input tensor
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+                                       *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+                                       *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
+#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+    // Row 0
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out0 = 0.0f;
+    out0.s0 = (w0.s0);
+    out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f;
+    out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f;
+    out0.s3 = (w0.s2);
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    // Row 1
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out1 = 0.0f;
+    out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f;
+    out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f;
+    out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f;
+    out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f;
+
+    // Row 2
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out2 = 0.0f;
+    out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f;
+    out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f;
+    out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f;
+    out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f;
+
+    // Row 3
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out3 = 0.0f;
+    out3.s0 = (w2.s0);
+    out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f;
+    out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f;
+    out3.s3 = (w2.s2);
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+    int z = get_global_id(2);
+    int x0 = z / SRC_DIM_Z; // idx filter
+    int y0 = z % SRC_DIM_Z; // idx channel
+
+    // Get output address
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
+
+    // Store the values across the channels
+    // 16 channels for 3x3 kernels
+    // 4 channels for 3x1 or 1x3 kernels
+    *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+    *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+    *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+    *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out1.s0;
+    *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out1.s1;
+    *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out1.s2;
+    *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out1.s3;
+    *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out2.s0;
+    *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out2.s1;
+    *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out2.s2;
+    *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out2.s3;
+    *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out3.s0;
+    *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out3.s1;
+    *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out3.s2;
+    *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out3.s3;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 4x4/4x1/1x4
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x4_3x3_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+    // Load the values from the input tensor
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+                                       *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+                                       *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
+#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+    // Row 0
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out0 = 0.0f;
+    out0.s0 = (w0.s0) / 16.f;
+    out0.s1 = (-w0.s0 - w0.s1 - w0.s2) / 24.f;
+    out0.s2 = (-w0.s0 + w0.s1 - w0.s2) / 24.f;
+    out0.s3 = (w0.s0 + 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
+    out0.s4 = (w0.s0 - 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
+    out0.s5 = (w0.s2) / 4.f;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    // Row 1
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out1 = 0.0f;
+    out1.s0 = (-w0.s0 - w1.s0 - w2.s0) / 24.f;
+    out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
+    out1.s2 = (w0.s0 + w1.s0 + w2.s0 - w0.s1 - w1.s1 - w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
+    out1.s3 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (-w0.s1 - w1.s1 - w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
+    out1.s4 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (w0.s1 + w1.s1 + w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
+    out1.s5 = (-w0.s2 - w1.s2 - w2.s2) / 6.f;
+
+    // Row 2
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out2 = 0.0f;
+    out2.s0 = (-w0.s0 + w1.s0 - w2.s0) / 24.f;
+    out2.s1 = (w0.s0 - w1.s0 + w2.s0 + w0.s1 - w1.s1 + w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
+    out2.s2 = (w0.s0 - w1.s0 + w2.s0 - w0.s1 + w1.s1 - w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
+    out2.s3 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (-w0.s1 + w1.s1 - w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
+    out2.s4 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (w0.s1 - w1.s1 + w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
+    out2.s5 = (-w0.s2 + w1.s2 - w2.s2) / 6.f;
+
+    // Row 3
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out3 = 0.0f;
+    out3.s0 = (w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
+    out3.s1 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 - 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+    out3.s2 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 + 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+    out3.s3 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 + 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+    out3.s4 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 - 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+    out3.s5 = (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
+
+    // Row 4
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out4 = 0.0f;
+    out4.s0 = (w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
+    out4.s1 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 + 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+    out4.s2 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 - 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+    out4.s3 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 - 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+    out4.s4 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 + 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+    out4.s5 = (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
+
+    // Row 5
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out5 = 0.0f;
+    out5.s0 = (w2.s0) / 4.f;
+    out5.s1 = (-w2.s0 - w2.s1 - w2.s2) / 6.f;
+    out5.s2 = (-w2.s0 + w2.s1 - w2.s2) / 6.f;
+    out5.s3 = (w2.s0 + 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
+    out5.s4 = (w2.s0 - 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
+    out5.s5 = (w2.s2);
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+    int z = get_global_id(2);
+    int x0 = z / SRC_DIM_Z; // idx filter
+    int y0 = z % SRC_DIM_Z; // idx channel
+
+    // Get output address
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
+
+    // Store the values across the channels
+    // 36 channels for 3x3 kernels
+    // 6 channels for 3x1 or 1x3 kernels
+    *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+    *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+    *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+    *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+    *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+    *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out1.s0;
+    *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out1.s1;
+    *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s2;
+    *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s3;
+    *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s4;
+    *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s5;
+    *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out2.s0;
+    *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out2.s1;
+    *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out2.s2;
+    *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out2.s3;
+    *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s4;
+    *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s5;
+    *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out3.s0;
+    *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out3.s1;
+    *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out3.s2;
+    *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out3.s3;
+    *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out3.s4;
+    *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out3.s5;
+    *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out4.s0;
+    *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out4.s1;
+    *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out4.s2;
+    *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out4.s3;
+    *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out4.s4;
+    *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out4.s5;
+    *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out5.s0;
+    *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out5.s1;
+    *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out5.s2;
+    *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out5.s3;
+    *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out5.s4;
+    *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out5.s5;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel performs Winograd filter transform 5x5/5x1 or 1x5 when the data layout is NCHW and the output tile is 4x4/4x1 or 1x4
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ *
+ * @note If this kernel is used to perform Winograd filter transform 5x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x5, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x4_5x5_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst))
+{
+    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+    // Load the values from the input tensor
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    w00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
+#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    w00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+                                        *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+                                        *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+                                        *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    w00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    w10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y) + 4);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    w20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+    DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y) + 4);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    w30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+    DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y) + 4);
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    w40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+    DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y) + 4);
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+    // Transform the input tile
+
+    // Row 0
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out0 = 0.0f;
+    out0.s0 = w00.s0;
+    out0.s1 = -2.f * (w00.s0 + w00.s1 + w00.s2 + w00.s3 + w01) / 9.f;
+    out0.s2 = -2.f * (w00.s0 - w00.s1 + w00.s2 - w00.s3 + w01) / 9.f;
+    out0.s3 = (w00.s0 + 2.f * w00.s1 + 4.f * w00.s2 + 8.f * w00.s3 + 16.f * w01) / 90.f;
+    out0.s4 = (w00.s0 - 2.f * w00.s1 + 4.f * w00.s2 - 8.f * w00.s3 + 16.f * w01) / 90.f;
+    out0.s5 = (16.f * w00.s0 + 8.f * w00.s1 + 4.f * w00.s2 + 2.f * w00.s3 + w01) / 180.f;
+    out0.s6 = (16.f * w00.s0 - 8.f * w00.s1 + 4.f * w00.s2 - 2.f * w00.s3 + w01) / 180.f;
+    out0.s7 = w01;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+    // Row 1
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out1 = 0.0f;
+    out1.s0 = -2.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) / 9.f;
+    out1.s1 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) +
+                     (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
+    out1.s2 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) -
+                     (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
+    out1.s3 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 8.f *
+                (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
+    out1.s4 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 8.f *
+                (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
+    out1.s5 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 2.f *
+                (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
+    out1.s6 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 2.f *
+                (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
+    out1.s7 = -2.f * (w01 + w11 + w21 + w31 + w41) / 9.f;
+
+    // Row 2
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out2 = 0.0f;
+    out2.s0 = -2.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) / 9.f;
+    out2.s1 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) +
+                     (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
+    out2.s2 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) -
+                     (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
+    out2.s3 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 8.f *
+                (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
+    out2.s4 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 8.f *
+                (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
+    out2.s5 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 2.f *
+                (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
+    out2.s6 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 2.f *
+                (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
+    out2.s7 = -2.f * (w01 - w11 + w21 - w31 + w41) / 9.f;
+
+    // Row 3
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out3 = 0.0f;
+    out3.s0 = (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
+    out3.s1 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
+                (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+                (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
+    out3.s2 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
+                (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+                (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
+    out3.s3 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+               (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
+    out3.s4 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+               (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
+    out3.s5 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+               (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
+    out3.s6 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+               (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
+    out3.s7 = (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) / 90.f;
+
+    // Row 4
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out4 = 0.0f;
+    out4.s0 = (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
+    out4.s1 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
+                (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+                (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
+    out4.s2 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
+                (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+                (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
+    out4.s3 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+               (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
16.f * + (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f; + out4.s4 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f * + (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f * + (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f; + out4.s5 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f * + (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + + (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f; + out4.s6 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f * + (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + + (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f; + out4.s7 = (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) / 90.f; + + // Row 5 + VEC_DATA_TYPE(DATA_TYPE, 8) + out5 = 0.0f; + out5.s0 = (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) / 180.f; + out5.s1 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + + (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f; + out5.s2 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + + (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f; + out5.s3 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f * + (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f * + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f; + out5.s4 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f * + (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f * + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f; + out5.s5 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f * + (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f; + out5.s6 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 
4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f * + (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f; + out5.s7 = (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) / 180.f; + + // Row 6 + VEC_DATA_TYPE(DATA_TYPE, 8) + out6 = 0.0f; + out6.s0 = (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) / 180.f; + out6.s1 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + + (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f; + out6.s2 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + + (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f; + out6.s3 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f * + (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f * + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f; + out6.s4 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f * + (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f * + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f; + out6.s5 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f * + (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f; + out6.s6 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f * + (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f; + out6.s7 = (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) / 180.f; + + // Row 7 + VEC_DATA_TYPE(DATA_TYPE, 8) + out7 = 0.0f; + out7.s0 = w40.s0; + out7.s1 = -2.f * (w40.s0 + w40.s1 + w40.s2 + w40.s3 + w41) / 9.f; + out7.s2 = -2.f * (w40.s0 - w40.s1 + w40.s2 - w40.s3 + w41) / 9.f; + out7.s3 = (w40.s0 + 2.f * w40.s1 + 4.f * w40.s2 + 8.f * w40.s3 + 16.f * w41) / 90.f; + out7.s4 = (w40.s0 - 2.f * w40.s1 + 4.f * w40.s2 - 8.f * w40.s3 + 16.f * w41) / 90.f; + out7.s5 = (16.f * w40.s0 + 8.f * w40.s1 + 4.f * w40.s2 + 2.f * w40.s3 + w41) / 180.f; + out7.s6 = (16.f * w40.s0 - 8.f * w40.s1 + 4.f * w40.s2 - 2.f * w40.s3 + w41) / 180.f; + out7.s7 = w41; +#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && 
!defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + + int z = get_global_id(2); + int x0 = z / SRC_DIM_Z; // idx filter + int y0 = z % SRC_DIM_Z; // idx channel + + // Get output address + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y; + + // Store the values across the channels + *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0; + *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1; + *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2; + *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3; + *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4; + *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5; + *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6; + *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7; + +#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0; + *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1; + *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2; + *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3; + *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4; + *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5; + *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6; + *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7; + *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0; + *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1; + *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2; + *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3; + *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4; + *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5; + *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6; + *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7; + *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0; + *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1; + *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2; + *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3; + *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4; + *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5; + *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6; + *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7; + *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0; + *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1; + *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2; + *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3; + *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4; + *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5; + *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6; + *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7; + *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0; + *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1; + *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2; + *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3; + *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4; + *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5; + *(__global DATA_TYPE *)(dst_addr 
+ 46 * dst_stride_z) = out5.s6; + *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7; + *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0; + *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1; + *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2; + *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3; + *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4; + *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5; + *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6; + *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7; + *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0; + *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1; + *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2; + *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3; + *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4; + *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5; + *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6; + *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7; +#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) +} + +#endif // defined(SRC_DIM_Z) + +#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) +/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 2x1 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_2x1_3x1_nchw( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_filter_transform_2x2_3x3_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} + +/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 4x1 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor.
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_4x1_3x1_nchw( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_filter_transform_4x4_3x3_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} + +/** This OpenCL kernel performs Winograd filter transform 5x1 when the data layout is NCHW and the output tile is 4x1 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor.
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_4x1_5x1_nchw( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_filter_transform_4x4_5x5_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} + +#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) + +#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) +/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x2 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor.
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_1x2_1x3_nchw( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_filter_transform_2x2_3x3_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} + +/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x4 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor.
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_1x4_1x3_nchw( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_filter_transform_4x4_3x3_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} + +/** This OpenCL kernel performs Winograd filter transform 1x5 when the data layout is NCHW and the output tile is 1x4 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor.
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_1x4_1x5_nchw( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_filter_transform_4x4_5x5_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} + +#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) diff --git a/src/core/CL/cl_kernels/nchw/winograd_input_transform.cl b/src/core/CL/cl_kernels/nchw/winograd_input_transform.cl new file mode 100644 index 0000000000..8c382183c3 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/winograd_input_transform.cl @@ -0,0 +1,1346 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ +#include "helpers.h" +#include "tile_helpers.h" + +#define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact) \ + ({ \ + comm_fact.s0 = tmp.s2 - 4.25f * tmp.s4 + tmp.s6; \ + comm_fact.s1 = tmp.s1 - 4.25f * tmp.s3 + tmp.s5; \ + comm_fact.s2 = 2.5f * tmp.s3; \ + comm_fact.s3 = 0.5f * tmp.s1 + 2.f * tmp.s5 - comm_fact.s2; \ + comm_fact.s4 = 0.25f * tmp.s2 - 1.25f * tmp.s4 + tmp.s6; \ + comm_fact.s5 = 4.f * tmp.s2 + tmp.s6 - 5.f * tmp.s4; \ + comm_fact.s6 = 2.f * tmp.s1 + 0.5f * tmp.s5 - comm_fact.s2; \ + \ + out.s0 = tmp.s0 - tmp.s6 + 5.25f * tmp.s4 - 5.25f * tmp.s2; \ + out.s1 = comm_fact.s0 + comm_fact.s1; \ + out.s2 = comm_fact.s0 - comm_fact.s1; \ + out.s3 = comm_fact.s3 + comm_fact.s4; \ + out.s4 = comm_fact.s4 - comm_fact.s3; \ + out.s5 = comm_fact.s5 + comm_fact.s6; \ + out.s6 = comm_fact.s5 - comm_fact.s6; \ + out.s7 = tmp.s7 - tmp.s1 + 5.25f * tmp.s3 - 5.25f * tmp.s5; \ + }) + +#define OUTPUT_ROW_2x2_7x7(out, tmp, comm_fact) \ + ({ \ + comm_fact.s0 = 36.0f * tmp.s2 - 13.0f * tmp.s4 + tmp.s6; \ + comm_fact.s1 = 36.0f * tmp.s1 - 13.0f * tmp.s3 + 1.0f * tmp.s5; \ + comm_fact.s2 = 9.0f * tmp.s2 - 10.0f * tmp.s4 + tmp.s6; \ + comm_fact.s3 = 18.0f * tmp.s1 - 20.0f * tmp.s3 + 2.0f * tmp.s5; \ + comm_fact.s4 = 4.0f * tmp.s2 - 5.0f * tmp.s4 + tmp.s6; \ + comm_fact.s5 = 12.0f * tmp.s1 - 15.0f * tmp.s3 + 3.0f * tmp.s5; \ + out.s0 = -36.0f * tmp.s0 + 49.0f * tmp.s2 + -14.0f * tmp.s4 + tmp.s6; \ + out.s1 = comm_fact.s0 - comm_fact.s1; \ + out.s2 = comm_fact.s0 + comm_fact.s1; \ + out.s3 = comm_fact.s2 - comm_fact.s3; \ + out.s4 = comm_fact.s2 + comm_fact.s3; \ + out.s5 = comm_fact.s4 - comm_fact.s5; \ + out.s6 = comm_fact.s4 + comm_fact.s5; \ + out.s7 = -36.0f * tmp.s1 + 0.0f * tmp.s2 + 49.0f * tmp.s3 - 14.0f * tmp.s5 + tmp.s7; \ + }) + +#if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H) +/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3 and the output tile is 2x2/2x1 or 1x2 + * + * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5). + * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 + * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source image. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + */ +__kernel void winograd_input_transform_2x2_3x3_stepz1_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); +#if defined(SRC_DEPTH) + const int z = get_global_id(2) % SRC_DEPTH; + const int b = get_global_id(2) / SRC_DEPTH; +#else /* defined(SRC_DEPTH) */ + const int z = get_global_id(2); +#endif /* defined(SRC_DEPTH) */ + + // Compute input address +#if defined(SRC_DEPTH) + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w; +#else /* defined(SRC_DEPTH) */ + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z; +#endif /* defined(SRC_DEPTH) */ + + src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y); + +#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) + VEC_DATA_TYPE(DATA_TYPE, 4) + in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr)); +#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) + VEC_DATA_TYPE(DATA_TYPE, 4) + in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)), + *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)), + *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)), + *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y))); +#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + VEC_DATA_TYPE(DATA_TYPE, 4) + in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y)); + VEC_DATA_TYPE(DATA_TYPE, 4) + in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y)); + VEC_DATA_TYPE(DATA_TYPE, 4) + in_row2 = vload4(0, (__global
DATA_TYPE *)(src_addr + 2 * src_stride_y)); + VEC_DATA_TYPE(DATA_TYPE, 4) + in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y)); +#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + + VEC_DATA_TYPE(DATA_TYPE, 4) + tmp0 = in_row0; + +#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + tmp0 -= in_row2; +#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + + DATA_TYPE out00 = tmp0.s0 - tmp0.s2; + DATA_TYPE out01 = tmp0.s1 + tmp0.s2; + DATA_TYPE out02 = tmp0.s2 - tmp0.s1; + DATA_TYPE out03 = tmp0.s1 - tmp0.s3; + +#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + VEC_DATA_TYPE(DATA_TYPE, 4) + tmp1 = in_row1 + in_row2; + VEC_DATA_TYPE(DATA_TYPE, 4) + tmp2 = in_row2 - in_row1; + VEC_DATA_TYPE(DATA_TYPE, 4) + tmp3 = in_row1 - in_row3; + + DATA_TYPE out10 = tmp1.s0 - tmp1.s2; + DATA_TYPE out11 = tmp1.s1 + tmp1.s2; + DATA_TYPE out12 = tmp1.s2 - tmp1.s1; + DATA_TYPE out13 = tmp1.s1 - tmp1.s3; + + DATA_TYPE out20 = tmp2.s0 - tmp2.s2; + DATA_TYPE out21 = tmp2.s1 + tmp2.s2; + DATA_TYPE out22 = tmp2.s2 - tmp2.s1; + DATA_TYPE out23 = tmp2.s1 - tmp2.s3; + + DATA_TYPE out30 = tmp3.s0 - tmp3.s2; + DATA_TYPE out31 = tmp3.s1 + tmp3.s2; + DATA_TYPE out32 = tmp3.s2 - tmp3.s1; + DATA_TYPE out33 = tmp3.s1 - tmp3.s3; +#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + +#if defined(SRC_DEPTH) + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w; +#else /* defined(SRC_DEPTH) */ + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y; +#endif /* defined(SRC_DEPTH) */ + + *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out00; // in_row0.s0; out00; + *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out01; // in_row0.s1; out01; + *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out02; // in_row0.s2; out02; + *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out03; // in_row0.s3; out03; + +#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out10; + *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out11; + *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out12; + *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out13; + *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out20; + *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out21; + *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out22; + *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out23; + *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out30; + *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out31; + *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out32; + *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out33; +#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) +} + +/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3, the output tile is 2x2/2x1 or 1x2 and the number of channels is a multiple of 2 + * + * @note The number of tiles in the x axis must be passed at compile time using
-DNUM_TILES_X (i.e.-DNUM_TILES_X=5). + * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 + * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + */ +__kernel void winograd_input_transform_2x2_3x3_stepz2_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); +#if defined(SRC_DEPTH) + const int z = (get_global_id(2) * 2) % SRC_DEPTH; + const int b = (get_global_id(2) * 2) / SRC_DEPTH; +#else /* defined(SRC_DEPTH) */ + const int z = get_global_id(2) * 2; +#endif /* defined(SRC_DEPTH) */ + + // Compute input address +#if defined(SRC_DEPTH) + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w; +#else /* defined(SRC_DEPTH) */ + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z; +#endif /* defined(SRC_DEPTH) */ + src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y); + +#if
defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) + VEC_DATA_TYPE(DATA_TYPE, 4) + in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr)); +#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) + VEC_DATA_TYPE(DATA_TYPE, 4) + in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)), + *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)), + *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)), + *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y))); +#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + VEC_DATA_TYPE(DATA_TYPE, 4) + in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y)); + VEC_DATA_TYPE(DATA_TYPE, 4) + in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y)); + VEC_DATA_TYPE(DATA_TYPE, 4) + in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y)); + VEC_DATA_TYPE(DATA_TYPE, 4) + in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y)); +#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + + src_addr += src_stride_z; +#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) + VEC_DATA_TYPE(DATA_TYPE, 4) + in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr)); +#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) + VEC_DATA_TYPE(DATA_TYPE, 4) + in_row4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)), + *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)), + *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)), + *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y))); +#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + VEC_DATA_TYPE(DATA_TYPE, 4) + in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y)); + VEC_DATA_TYPE(DATA_TYPE, 4) + in_row5 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y)); + VEC_DATA_TYPE(DATA_TYPE, 4) + in_row6 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y)); + VEC_DATA_TYPE(DATA_TYPE, 4) + in_row7 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y)); +#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + + VEC_DATA_TYPE(DATA_TYPE, 4) + tmp0 = in_row0; + VEC_DATA_TYPE(DATA_TYPE, 4) + tmp4 = in_row4; + +#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + tmp0 -= in_row2; + tmp4 -= in_row6; +#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + + VEC_DATA_TYPE(DATA_TYPE, 2) + out00 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s0 - tmp0.s2, tmp4.s0 - tmp4.s2); + VEC_DATA_TYPE(DATA_TYPE, 2) + out01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 + tmp0.s2, tmp4.s1 + tmp4.s2); + VEC_DATA_TYPE(DATA_TYPE, 2) + out02 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s2 - tmp0.s1, tmp4.s2 - tmp4.s1); + VEC_DATA_TYPE(DATA_TYPE, 2) + out03 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 - tmp0.s3, tmp4.s1 - tmp4.s3); + +#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + VEC_DATA_TYPE(DATA_TYPE, 4) + tmp1 = in_row1 + in_row2; + VEC_DATA_TYPE(DATA_TYPE, 4) + tmp2 = in_row2 - in_row1; + VEC_DATA_TYPE(DATA_TYPE, 4) + tmp3 = in_row1 - in_row3; + + VEC_DATA_TYPE(DATA_TYPE, 4) + tmp5 = in_row5 + in_row6; + VEC_DATA_TYPE(DATA_TYPE, 4) + tmp6 = in_row6 - in_row5; +
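// tmp5..tmp7 apply the same row combinations as tmp1..tmp3 to the second channel (in_row4..in_row7); each outNM therefore packs the two channels into a 2-element vector, written back below with a single vstore2 per transformed value. +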
VEC_DATA_TYPE(DATA_TYPE, 4) + tmp7 = in_row5 - in_row7; + + VEC_DATA_TYPE(DATA_TYPE, 2) + out10 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s0 - tmp1.s2, tmp5.s0 - tmp5.s2); + VEC_DATA_TYPE(DATA_TYPE, 2) + out11 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 + tmp1.s2, tmp5.s1 + tmp5.s2); + VEC_DATA_TYPE(DATA_TYPE, 2) + out12 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s2 - tmp1.s1, tmp5.s2 - tmp5.s1); + VEC_DATA_TYPE(DATA_TYPE, 2) + out13 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 - tmp1.s3, tmp5.s1 - tmp5.s3); + + VEC_DATA_TYPE(DATA_TYPE, 2) + out20 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s0 - tmp2.s2, tmp6.s0 - tmp6.s2); + VEC_DATA_TYPE(DATA_TYPE, 2) + out21 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 + tmp2.s2, tmp6.s1 + tmp6.s2); + VEC_DATA_TYPE(DATA_TYPE, 2) + out22 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s2 - tmp2.s1, tmp6.s2 - tmp6.s1); + VEC_DATA_TYPE(DATA_TYPE, 2) + out23 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 - tmp2.s3, tmp6.s1 - tmp6.s3); + + VEC_DATA_TYPE(DATA_TYPE, 2) + out30 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s0 - tmp3.s2, tmp7.s0 - tmp7.s2); + VEC_DATA_TYPE(DATA_TYPE, 2) + out31 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 + tmp3.s2, tmp7.s1 + tmp7.s2); + VEC_DATA_TYPE(DATA_TYPE, 2) + out32 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s2 - tmp3.s1, tmp7.s2 - tmp7.s1); + VEC_DATA_TYPE(DATA_TYPE, 2) + out33 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 - tmp3.s3, tmp7.s1 - tmp7.s3); +#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + +#if defined(SRC_DEPTH) + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w; +#else /* defined(SRC_DEPTH) */ + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y; +#endif /* defined(SRC_DEPTH) */ + + vstore2(out00, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)); + vstore2(out01, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)); + vstore2(out02, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)); + vstore2(out03, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)); + +#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + vstore2(out10, 0, (__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)); + vstore2(out11, 0, (__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)); + vstore2(out12, 0, (__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)); + vstore2(out13, 0, (__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)); + vstore2(out20, 0, (__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)); + vstore2(out21, 0, (__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)); + vstore2(out22, 0, (__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)); + vstore2(out23, 0, (__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)); + vstore2(out30, 0, (__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)); + vstore2(out31, 0, (__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)); + vstore2(out32, 0, (__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)); + vstore2(out33, 0, (__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)); +#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) +} + +/** This OpenCL kernel computes the input transform when the output tile is 4x4/4x1 or 1x4, the filter size is 3x3/3x1 or 1x3 and the data layout is NCHW + * + * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
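+ * @note Example build options (illustrative values only): -DDATA_TYPE=float -DNUM_TILES_X=14 -DPAD_LEFT=1 -DPAD_TOP=1 -DOUTPUT_TILE_W=4 -DOUTPUT_TILE_H=4 -DSRC_DEPTH=32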
+ * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 + * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + */ +__kernel void winograd_input_transform_4x4_3x3_stepz1_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) +{ + const int x = get_global_id(0); + const int y = get_global_id(1); +#if defined(SRC_DEPTH) + const int z = get_global_id(2) % SRC_DEPTH; + const int b = get_global_id(2) / SRC_DEPTH; +#else /* defined(SRC_DEPTH) */ + const int z = get_global_id(2); +#endif /* defined(SRC_DEPTH) */ + + // Compute input address +#if defined(SRC_DEPTH) + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w; +#else /* defined(SRC_DEPTH) */ + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z; +#endif /* defined(SRC_DEPTH) */ + + src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y); + +#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + // Row0 +
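// (vertical 1x3 case: the six input samples of the tile are gathered one per row along y, rather than with the vload4/vload2 pair along x used in the branch below) +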
VEC_DATA_TYPE(DATA_TYPE, 4) + d00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)), + *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)), + *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)), + *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y))); + VEC_DATA_TYPE(DATA_TYPE, 2) + d01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(*((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)), + *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y))); +#else // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + // Row0 + VEC_DATA_TYPE(DATA_TYPE, 4) + d00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y)); + VEC_DATA_TYPE(DATA_TYPE, 2) + d01 = vload2(2, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y)); +#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + + DATA_TYPE out0 = 0.0f; + DATA_TYPE out1 = 0.0f; + DATA_TYPE out2 = 0.0f; + DATA_TYPE out3 = 0.0f; + DATA_TYPE out4 = 0.0f; + DATA_TYPE out5 = 0.0f; + + // Channels [0, 5]: [out00, out01, out02, out03, out04, out05] + out0 += 16.0f * d00.s0 - 20.0f * d00.s2 + 4.0f * d01.s0; + out1 += -16.0f * d00.s1 - 16.0f * d00.s2 + 4.0f * d00.s3 + 4.0f * d01.s0; + out2 += 16.0f * d00.s1 - 16.0f * d00.s2 - 4.0f * d00.s3 + 4.0f * d01.s0; + out3 += -8.0f * d00.s1 - 4.0f * d00.s2 + 8.0f * d00.s3 + 4.0f * d01.s0; + out4 += 8.0f * d00.s1 - 4.0f * d00.s2 - 8.0f * d00.s3 + 4.0f * d01.s0; + out5 += 16.0f * d00.s1 - 20.0f * d00.s3 + 4.0f * d01.s1; + +#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + // Row4 + VEC_DATA_TYPE(DATA_TYPE, 4) + d40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y)); + VEC_DATA_TYPE(DATA_TYPE, 2) + d41 = vload2(2, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y)); + + // k0, k1, k2, k3, k4, k5 are common terms for row0, row1, row2, row3 and row4 + DATA_TYPE k0 = d41.s0; + DATA_TYPE k1 = d41.s0; + DATA_TYPE k2 = d41.s0; + DATA_TYPE k3 = d41.s0; + DATA_TYPE k4 = d41.s0; + DATA_TYPE k5 = 0.0f; + + k0 += 4.0f * d40.s0 - 5.0f * d40.s2; + k1 += -4.0f * d40.s1 - 4.0f * d40.s2 + d40.s3; + k2 += 4.0f * d40.s1 - 4.0f * d40.s2 - d40.s3; + k3 += -2.0f * d40.s1 + 2.0f * d40.s3 - d40.s2; + k4 += 2.0f * d40.s1 - 2.0f * d40.s3 - d40.s2; + k5 += 4.0f * d40.s1 - 5.0f * d40.s3 + d41.s1; + + out0 += k0; + out1 += k1; + out2 += k2; + out3 += k3; + out4 += k4; + out5 += k5; + + // Row2 + VEC_DATA_TYPE(DATA_TYPE, 4) + d20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y)); + VEC_DATA_TYPE(DATA_TYPE, 2) + d21 = vload2(2, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y)); + + out0 += -20.0f * d20.s0 + 25.0f * d20.s2 - 5.0f * d21.s0; + out1 += +20.0f * d20.s1 + 20.0f * d20.s2 - 5.0f * d20.s3 - 5.0f * d21.s0; + out2 += -20.0f * d20.s1 + 20.0f * d20.s2 + 5.0f * d20.s3 - 5.0f * d21.s0; + out3 += +10.0f * d20.s1 + 5.0f * d20.s2 - 10.0f * d20.s3 - 5.0f * d21.s0; + out4 += -10.0f * d20.s1 + 5.0f * d20.s2 + 10.0f * d20.s3 - 5.0f * d21.s0; + out5 += -20.0f * d20.s1 + 25.0f * d20.s3 - 5.0f * d21.s1; +#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + + // Compute destination address +#if defined(SRC_DEPTH) + __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w); +#else /* defined(SRC_DEPTH) */ + __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y); 
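+    // Each transformed value is written to its own z-plane of the destination: the 6 (1D case) or 36 (2D case)
+    // channels of one tile are strided by dst_stride_z, which dst_plane_stride converts to an element stride below.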
+#endif /* defined(SRC_DEPTH) */ + + uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE); + + *(dst_addr) = out0; + dst_addr += dst_plane_stride; + *(dst_addr) = out1; + dst_addr += dst_plane_stride; + *(dst_addr) = out2; + dst_addr += dst_plane_stride; + *(dst_addr) = out3; + dst_addr += dst_plane_stride; + *(dst_addr) = out4; + dst_addr += dst_plane_stride; + *(dst_addr) = out5; + dst_addr += dst_plane_stride; + +#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + DATA_TYPE out6 = k0; + DATA_TYPE out7 = k1; + DATA_TYPE out8 = k2; + DATA_TYPE out9 = k3; + DATA_TYPE out10 = k4; + DATA_TYPE out11 = k5; + DATA_TYPE out12 = k0; + DATA_TYPE out13 = k1; + DATA_TYPE out14 = k2; + DATA_TYPE out15 = k3; + DATA_TYPE out16 = k4; + DATA_TYPE out17 = k5; + DATA_TYPE out18 = k0; + DATA_TYPE out19 = k1; + DATA_TYPE out20 = k2; + DATA_TYPE out21 = k3; + DATA_TYPE out22 = k4; + DATA_TYPE out23 = k5; + DATA_TYPE out24 = k0; + DATA_TYPE out25 = k1; + DATA_TYPE out26 = k2; + DATA_TYPE out27 = k3; + DATA_TYPE out28 = k4; + DATA_TYPE out29 = k5; + + // Row1 + VEC_DATA_TYPE(DATA_TYPE, 4) + d10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y)); + VEC_DATA_TYPE(DATA_TYPE, 2) + d11 = vload2(2, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y)); + + // Row3 + VEC_DATA_TYPE(DATA_TYPE, 4) + d30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y)); + VEC_DATA_TYPE(DATA_TYPE, 2) + d31 = vload2(2, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y)); + + // Compute common parts for the channels between [6, 29] + // Channels [6, 11]: [out10, out11, out12, out13, out14, out15] + // Channels [12, 17]: [out20, out21, out22, out23, out24, out25] + DATA_TYPE part0 = -16.0f * d20.s0 + 20.0f * d20.s2 - 4.0f * d21.s0; + DATA_TYPE part1 = 16.0f * d10.s0 - 20.0f * d10.s2 + 4.0f * d11.s0 - 4.0f * d30.s0 + 5.0f * d30.s2 - d31.s0; + DATA_TYPE part2 = 16.0f * d20.s2 - 4.0f * d21.s0; + DATA_TYPE part3 = 16.0f * d20.s1 - 4.0f * d20.s3; + DATA_TYPE part4 = 16.0f * d10.s2 - 4.0f * d11.s0 - 4.0f * d30.s2 + d31.s0; + DATA_TYPE part5 = 16.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + d30.s3; + DATA_TYPE part6 = 4.0f * d20.s2 - 4.0f * d21.s0; + DATA_TYPE part7 = 8.0f * d10.s1 - 8.0f * d10.s3 - 2.0f * d30.s1 + 2.0f * d30.s3; + DATA_TYPE part8 = 4.0f * d10.s2 - 4.0f * d11.s0 - d30.s2 + d31.s0; + DATA_TYPE part9 = 8.0f * d20.s1 - 8.0f * d20.s3; + DATA_TYPE part10 = -16.0f * d20.s1 + 20.0f * d20.s3 - 4.0f * d21.s1; + DATA_TYPE part11 = -16.0f * d10.s1 + 20.0f * d10.s3 - 4.0f * d11.s1 + 4.0f * d30.s1 - 5.0f * d30.s3 + d31.s1; + + // Channels [18, 23]: [out30, out31, out32, out33, out34, out35] + // Channels [24, 29]: [out40, out41, out42, out43, out44, out45] + DATA_TYPE part12 = 8.0f * d10.s0 - 10.0f * d10.s2 + 2.0f * d11.s0 - 8.0f * d30.s0 + 10.0f * d30.s2 - 2.0f * d31.s0; + DATA_TYPE part13 = part0 * 0.25f; // -4.0f * d20.s0 + 5.0f * d20.s2 - d21.s0 + DATA_TYPE part14 = part2 * 0.25f; // 4.0f * d20.s2 - d21.s0 + DATA_TYPE part15 = 8.0f * d10.s1 - 2.0f * d10.s3 - 8.0f * d30.s1 + 2.0f * d30.s3; + DATA_TYPE part16 = 8.0f * d10.s2 - 2.0f * d11.s0 - 8.0f * d30.s2 + 2.0f * d31.s0; + DATA_TYPE part17 = part3 * 0.25f; // 4.0f * d20.s1 - d20.s3 + DATA_TYPE part18 = part6 * 0.25f; // d20.s2 - d21.s0 + DATA_TYPE part19 = 4.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + 4.0f * d30.s3; + DATA_TYPE part20 = 2.0f * d10.s2 - 2.0f * d11.s0 - 2.0f * d30.s2 + 2.0f * d31.s0; + DATA_TYPE part21 = part9 * 0.25f; // 2.0f * (d20.s1 - d20.s3) + DATA_TYPE part22 = part10 * 0.25f; // - 4.0f * 
d20.s1 + 5.0f * d20.s3 - d21.s1 + DATA_TYPE part23 = part11 * 0.5f + 6.0f * d30.s1 - 7.5f * d30.s3 + 1.5f * d31.s1; // - 8.0f * d10.s1 + 10.0f * d10.s3 - 2.0f * d11.s1 + 8.0f * d30.s1 - 10.0f * d30.s3 + 2.0f * d31.s1; + + out6 += part0 - part1; + out12 += part0 + part1; + out7 += part2 + part3 + part4 + part5; + out8 += part2 - part3 + part4 - part5; + out13 += part2 + part3 - part4 - part5; + out14 += part2 - part3 - part4 + part5; + out9 += part6 + part7 + part8 + part9; + out10 += part6 - part7 + part8 - part9; + out15 += part6 - part7 - part8 + part9; + out16 += part6 + part7 - part8 - part9; + out11 += part10 + part11; + out17 += part10 - part11; + + out18 += part13 - part12; + out24 += part13 + part12; + out19 += part14 + part15 + part16 + part17; + out20 += part14 - part15 + part16 - part17; + out25 += part14 - part15 - part16 + part17; + out26 += part14 + part15 - part16 - part17; + out21 += part18 + part19 + part20 + part21; + out22 += part18 - part19 + part20 - part21; + out27 += part18 - part19 - part20 + part21; + out28 += part18 + part19 - part20 - part21; + out23 += part22 + part23; + out29 += part22 - part23; + + *(dst_addr) = out6; + dst_addr += dst_plane_stride; + *(dst_addr) = out7; + dst_addr += dst_plane_stride; + *(dst_addr) = out8; + dst_addr += dst_plane_stride; + *(dst_addr) = out9; + dst_addr += dst_plane_stride; + *(dst_addr) = out10; + dst_addr += dst_plane_stride; + *(dst_addr) = out11; + dst_addr += dst_plane_stride; + *(dst_addr) = out12; + dst_addr += dst_plane_stride; + *(dst_addr) = out13; + dst_addr += dst_plane_stride; + *(dst_addr) = out14; + dst_addr += dst_plane_stride; + *(dst_addr) = out15; + dst_addr += dst_plane_stride; + *(dst_addr) = out16; + dst_addr += dst_plane_stride; + *(dst_addr) = out17; + dst_addr += dst_plane_stride; + + *(dst_addr) = out18; + dst_addr += dst_plane_stride; + *(dst_addr) = out19; + dst_addr += dst_plane_stride; + *(dst_addr) = out20; + dst_addr += dst_plane_stride; + *(dst_addr) = out21; + dst_addr += dst_plane_stride; + *(dst_addr) = out22; + dst_addr += dst_plane_stride; + *(dst_addr) = out23; + dst_addr += dst_plane_stride; + *(dst_addr) = out24; + dst_addr += dst_plane_stride; + *(dst_addr) = out25; + dst_addr += dst_plane_stride; + *(dst_addr) = out26; + dst_addr += dst_plane_stride; + *(dst_addr) = out27; + dst_addr += dst_plane_stride; + *(dst_addr) = out28; + dst_addr += dst_plane_stride; + *(dst_addr) = out29; + dst_addr += dst_plane_stride; + + // Row5 + VEC_DATA_TYPE(DATA_TYPE, 4) + d50 = vload4(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y)); + VEC_DATA_TYPE(DATA_TYPE, 2) + d51 = vload2(2, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y)); + + // Channels [30, 35] + out0 = 16.0f * d10.s0 - 20.0f * d10.s2 - 20.0f * d30.s0 + 25.0f * d30.s2 + 4.0f * d50.s0 - 5.0f * d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0; + out1 = -16.0f * d10.s1 - 16.0f * d10.s2 + 4.0f * d10.s3 + 20.0f * d30.s1 + 20.0f * d30.s2 - 5.0f * d30.s3 - 4.0f * d50.s1 - 4.0f * d50.s2 + d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0; + out2 = 16.0f * d10.s1 - 16.0f * d10.s2 - 4.0f * d10.s3 - 20.0f * d30.s1 + 20.0f * d30.s2 + 5.0f * d30.s3 + 4.0f * d50.s1 - 4.0f * d50.s2 - d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0; + out3 = -8.0f * d10.s1 - 4.0f * d10.s2 + 8.0f * d10.s3 + 10.0f * d30.s1 - 10.0f * d30.s3 + 5.0f * d30.s2 - 2.0f * d50.s1 + 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0; + out4 = 8.0f * d10.s1 - 4.0f * d10.s2 - 8.0f * d10.s3 - 10.0f * d30.s1 + 5.0f * d30.s2 + 10.0f * d30.s3 + 2.0f * d50.s1 - 2.0f * 
d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0; + out5 = 16.0f * d10.s1 - 20.0f * d10.s3 + 4.0f * d11.s1 - 20.0f * d30.s1 + 25.0f * d30.s3 - 5.0f * d31.s1 + 4.0f * d50.s1 - 5.0f * d50.s3 + d51.s1; + + *(dst_addr) = out0; + dst_addr += dst_plane_stride; + *(dst_addr) = out1; + dst_addr += dst_plane_stride; + *(dst_addr) = out2; + dst_addr += dst_plane_stride; + *(dst_addr) = out3; + dst_addr += dst_plane_stride; + *(dst_addr) = out4; + dst_addr += dst_plane_stride; + *(dst_addr) = out5; + dst_addr += dst_plane_stride; +#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) +} + +/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NCHW + * + * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5). + * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 + * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. 
Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ */
+__kernel void winograd_input_transform_4x4_5x5_stepz1_nchw(
+    TENSOR3D_DECLARATION(src),
+    TENSOR3D_DECLARATION(dst),
+    uint src_stride_w,
+    uint dst_stride_w)
+{
+    const int x = get_global_id(0);
+    const int y = get_global_id(1);
+#if defined(SRC_DEPTH)
+    const int z = get_global_id(2) % SRC_DEPTH;
+    const int b = get_global_id(2) / SRC_DEPTH;
+#else /* defined(SRC_DEPTH) */
+    const int z = get_global_id(2);
+#endif /* defined(SRC_DEPTH) */
+
+    // Compute input address
+#if defined(SRC_DEPTH)
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
+#else /* defined(SRC_DEPTH) */
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
+#endif /* defined(SRC_DEPTH) */
+    src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
+
+    // Load input tile
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+                                                                              *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+                                                                              *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+                                                                              *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)),
+                                                                              *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
+                                                                              *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)),
+                                                                              *((__global DATA_TYPE *)(src_addr + 6 * src_stride_y)),
+                                                                              *((__global DATA_TYPE *)(src_addr + 7 * src_stride_y)));
+#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row1 = vload8(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row2 = vload8(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row3 = vload8(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row4 = vload8(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row5 = vload8(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
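+    // For reference: the 8x8 tile is transformed with the 8-point Winograd input matrix B^T whose first row is
+    // {1, 0, -5.25, 0, 5.25, 0, -1, 0}; the 5.25f/4.25f/2.5f/... constants in tmp*/comm_fact* below are entries
+    // (and products) of that matrix.
+    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row6 = vload8(0, (__global DATA_TYPE *)(src_addr + 6 *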
src_stride_y)); + const VEC_DATA_TYPE(DATA_TYPE, 8) in_row7 = vload8(0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y)); +#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + + // Calculate common factors for intermediate tensor + VEC_DATA_TYPE(DATA_TYPE, 8) + tmp0 = in_row0; + VEC_DATA_TYPE(DATA_TYPE, 8) + comm_fact0 = 0.0f; + +#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + comm_fact0 += in_row2 + in_row6 - (DATA_TYPE)4.25f * in_row4; + tmp0 += -in_row6 + (DATA_TYPE)5.25f * in_row4 - (DATA_TYPE)5.25f * in_row2; + + VEC_DATA_TYPE(DATA_TYPE, 8) + comm_fact1 = in_row1 + in_row5 - (DATA_TYPE)4.25f * in_row3; + VEC_DATA_TYPE(DATA_TYPE, 8) + comm_fact2 = (DATA_TYPE)0.25f * in_row2 - (DATA_TYPE)1.25f * in_row4 + in_row6; + + const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 + comm_fact1; + const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 - comm_fact1; + + comm_fact0 = (DATA_TYPE)2.5f * in_row3; + comm_fact1 = (DATA_TYPE)0.5f * in_row1 - comm_fact0 + (DATA_TYPE)2.0f * in_row5; + + const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact1 + comm_fact2; + const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 - comm_fact1; + + comm_fact1 = (DATA_TYPE)2.0f * in_row1 - comm_fact0 + (DATA_TYPE)0.5f * in_row5; + comm_fact2 = (DATA_TYPE)4.0f * in_row2 - (DATA_TYPE)5.0f * in_row4 + in_row6; + + const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact1 + comm_fact2; + const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact2 - comm_fact1; + const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = in_row7 - in_row1 + (DATA_TYPE)5.25f * in_row3 - (DATA_TYPE)5.25f * in_row5; +#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + + // Calculate output rows (reuse comm_fact0 vector) + VEC_DATA_TYPE(DATA_TYPE, 8) + out0; + + OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0); + +#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + VEC_DATA_TYPE(DATA_TYPE, 8) + out1, out2, out3, out4, out5, out6, out7; + + OUTPUT_ROW_4x4_5x5(out1, tmp1, comm_fact0); + OUTPUT_ROW_4x4_5x5(out2, tmp2, comm_fact0); + OUTPUT_ROW_4x4_5x5(out3, tmp3, comm_fact0); + OUTPUT_ROW_4x4_5x5(out4, tmp4, comm_fact0); + OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0); + OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0); + OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0); +#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + + // Store values across the channels +#if defined(SRC_DEPTH) + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w; +#else /* defined(SRC_DEPTH) */ + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y; +#endif /* defined(SRC_DEPTH) */ + + *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0; + *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1; + *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2; + *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3; + *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4; + *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5; + *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6; + *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7; + +#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && 
!defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out1.s0; + *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out1.s1; + *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2; + *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3; + *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4; + *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5; + *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6; + *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7; + *((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0; + *((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1; + *((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2; + *((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3; + *((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4; + *((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5; + *((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6; + *((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7; + *((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0; + *((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1; + *((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2; + *((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3; + *((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4; + *((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5; + *((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6; + *((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7; + *((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0; + *((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1; + *((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2; + *((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3; + *((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4; + *((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5; + *((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6; + *((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7; + *((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0; + *((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1; + *((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2; + *((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3; + *((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4; + *((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5; + *((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6; + *((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7; + *((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0; + *((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1; + *((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2; + *((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3; + *((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4; + *((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5; + *((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6; + *((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7; + *((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0; + *((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1; + *((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = 
out7.s2; + *((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3; + *((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4; + *((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5; + *((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6; + *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7; +#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) +} + +#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) +/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 2x1 + * + * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5). + * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1 + * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. 
Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + */ +__kernel void winograd_input_transform_2x1_3x1_stepz1_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) +{ + winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes, + src_stride_w, + dst_stride_w); +} + +/** This OpenCL kernel computes the input transform when the kernel size is 3x1, the output tile is 2x1 and the number of channels is multiple of 2 + * + * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5). + * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1 + * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. 
Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + */ +__kernel void winograd_input_transform_2x1_3x1_stepz2_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) +{ + winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes, + src_stride_w, + dst_stride_w); +} + +/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 + * + * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5). + * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1 + * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. 
Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + */ +__kernel void winograd_input_transform_4x1_3x1_stepz1_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) +{ + winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes, + src_stride_w, + dst_stride_w); +} + +/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 when the data layout is NCHW + * + * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5). + * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 + * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. 
Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + */ +__kernel void winograd_input_transform_4x1_5x1_stepz1_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) +{ + winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes, + src_stride_w, + dst_stride_w); +} +#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) + +#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) +/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x2 + * + * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5). + * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 + * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. 
Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + */ +__kernel void winograd_input_transform_1x2_1x3_stepz1_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) +{ + winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes, + src_stride_w, + dst_stride_w); +} + +/** This OpenCL kernel computes the input transform when the kernel size is 1x3, the output tile is 1x2 and the number of channels is multiple of 2 + * + * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5). + * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 + * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. 
Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + */ +__kernel void winograd_input_transform_1x2_1x3_stepz2_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) +{ + winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes, + src_stride_w, + dst_stride_w); +} + +/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 + * + * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5). + * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. 
Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + */ +__kernel void winograd_input_transform_1x4_1x3_stepz1_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) +{ + winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes, + src_stride_w, + dst_stride_w); +} + +/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 + * + * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5). + * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. 
Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + */ +__kernel void winograd_input_transform_1x4_1x5_stepz1_nchw( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) +{ + winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes, + src_stride_w, + dst_stride_w); +} +#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) +#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H) diff --git a/src/core/CL/cl_kernels/nchw/winograd_output_transform.cl b/src/core/CL/cl_kernels/nchw/winograd_output_transform.cl new file mode 100644 index 0000000000..861ed50651 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/winograd_output_transform.cl @@ -0,0 +1,1082 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "activation_float_helpers.h" +#include "helpers.h" +#include "tile_helpers.h" + +#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H) +#if defined(VEC_SIZE) && VEC_SIZE == 2 +/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. 
-DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 + * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu + * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively. + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. Accepted values are -DVEC_SIZE=2 (for output_tile_size 2x2, 2x1, 1x2) and -DVEC_SIZE=4 (for output_tile_size 4x4, 4x1, 1x4) + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_2x2_3x3_nchw(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(bias)
+#endif // defined(HAS_BIAS)
+)
+{
+    // Each thread stores a 2x2/2x1 or 1x2 tile according to the filter size
+#if defined(SRC_DEPTH)
+    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
+    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+#else /* defined(SRC_DEPTH) */
+    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+    const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
+#endif /* defined(SRC_DEPTH) */
+
+    // Load the values across the 16 or 4 channels to compose the 4x4 or 4x1/1x4 tile
+    DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
+    DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
+    DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
+    DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    // Compute the 2x1 or 1x2 output tile
+    // out00 = d00 + d01 + d02
+    // out01 = d01 - d02 - d03
+
+    float out00 = d00 + d01 + d02;
+    float out01 = d01 - d02 - d03;
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+    DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
+    DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
+    DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
+    DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
+
+    DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
+    DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
+    DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
+    DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
+
+    DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
+    DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
+    DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
+    DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
+
+    // Compute the 2x2 output tile
+    float k0 = d01 + d11 + d21;
+    float k1 = d02 + d12 + d22;
+    float k2 = d11 - d21 - d31;
+    float k3 = d12 - d22 - d32;
+
+    // out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22
+    // out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)
+    // out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)
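+    // (these expressions are the rows of A^T = {{1, 1, 1, 0}, {0, 1, -1, -1}} applied along both dimensions)
+    // out11 = d11 - d21 - d31 - (d12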
- d22 - d32) - (d13 - d23 - d33) + + float out00 = d10; + float out01 = -d13; + float out10 = d10; + float out11 = -d13; + + out00 += d00 + d20 + k0 + k1; + out01 += k0 - k1 - (d03 + d23); + out10 += -d20 - d30 + k2 + k3; + out11 += k2 - k3 + d23 + d33; +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + + int y_in = get_global_id(1); + int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W; + int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H; + int z_out = get_global_id(0); +#if defined(SRC_DEPTH) + int batch = get_global_id(2) / SRC_DEPTH; +#endif /* defined(SRC_DEPTH) */ + +#if defined(HAS_BIAS) + // Add bias + Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias); + + float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out))); + + out00 += (float)b; + out01 += (float)b; +#endif // defined(HAS_BIAS) + + // Get output address +#if defined(SRC_DEPTH) + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w; +#else /* defined(SRC_DEPTH) */ + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z; +#endif /* defined(SRC_DEPTH) */ + + // Store the output tile +#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + const VEC_DATA_TYPE(DATA_TYPE, 2) + out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL); + *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0; + *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1; +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0, + (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)); +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + +#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) +#if defined(HAS_BIAS) + // Add bias + out10 += (DATA_TYPE)b; + out11 += (DATA_TYPE)b; +#endif // defined(HAS_BIAS) + vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0, + (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)); +#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) +} +#endif // defined(VEC_SIZE) && VEC_SIZE == 2 + +#if defined(VEC_SIZE) && VEC_SIZE == 4 +/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 3x3 and the data layout is NCHW + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. 
-DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_4x4_3x3_nchw( + TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst) +#if defined(HAS_BIAS) + , + VECTOR_DECLARATION(bias) +#endif // defined(HAS_BIAS) +) +{ + // Each thread stores a 4x4/4x1 or 1x4 tile +#if defined(SRC_DEPTH) + Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH); + const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0); +#else /* defined(SRC_DEPTH) */ + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0); +#endif /* defined(SRC_DEPTH) */ + + // Load the values across the channels to compose the 6x6 or 6x1 tile + DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z)); + DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z)); + DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z)); + DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z)); + DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z)); + DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z)); + +#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + // Compute out00, out01, out02 and out03 + float out00 = d00 + d01 + d02 + d03 + d04; + float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04; + float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04; + float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05; +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + + DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * 
src_stride_z)); + DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z)); + DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z)); + DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z)); + DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z)); + DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z)); + + DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z)); + DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z)); + DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z)); + DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z)); + DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z)); + DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z)); + + DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z)); + DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z)); + DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z)); + DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z)); + DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z)); + DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z)); + + DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z)); + DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z)); + DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z)); + DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z)); + DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z)); + DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z)); + + DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z)); + DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z)); + DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z)); + DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z)); + DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z)); + DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z)); + + // Compute out00, out01, out02 and out03 + float out00 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31; + float out01 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31; + float out02 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31; + float out03 = (float)d01 + d21 + (float)d41 + (float)d11 + (float)d31; + + float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44; + float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44; + + out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42; + out01 += k1 - d02 - d12 - d22 - d32 - d42; + out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42; + out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45; + + // Compute out10, out11, out12 and out13 + float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41; + float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41; + float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41; + float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41; + + k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44; + k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44; + + out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * 
d40 - 2.0f * d42; + out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42; + out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42; + out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45; + + // Compute out20, out21, out22 and out23 + float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41; + float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41; + float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41; + float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41; + + k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44; + k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44; + + out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42; + out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42; + out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42; + out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45; + + // Compute out30, out31, out32 and out33 + float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51; + float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51; + float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51; + float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51; + + k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54; + k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54; + + out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52; + out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52; + out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52; + out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55; +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + + int y_in = get_global_id(1); + int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W; + int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H; + int z_out = get_global_id(0); +#if defined(SRC_DEPTH) + int batch = get_global_id(2) / SRC_DEPTH; +#endif /* defined(SRC_DEPTH) */ + +#if defined(HAS_BIAS) + // Add bias + Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias); + + float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out))); + + out00 += (float)b; + out01 += (float)b; + out02 += (float)b; + out03 += (float)b; +#endif // defined(HAS_BIAS) + + // Get output address +#if defined(SRC_DEPTH) + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w; +#else /* defined(SRC_DEPTH) */ + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z; +#endif /* defined(SRC_DEPTH) */ + + // Store the output tile + const VEC_DATA_TYPE(DATA_TYPE, 4) + out0_dt = CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)); + +#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0; + *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1; + *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2; + *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3; +#else // 
defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + vstore4(out0_dt, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)); +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + +#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) +#if defined(HAS_BIAS) + // Add bias + out10 += (float)b; + out11 += (float)b; + out12 += (float)b; + out13 += (float)b; + + out20 += (float)b; + out21 += (float)b; + out22 += (float)b; + out23 += (float)b; + + out30 += (float)b; + out31 += (float)b; + out32 += (float)b; + out33 += (float)b; +#endif // defined(HAS_BIAS) + vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)), 0, + (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)); + vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)), 0, + (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)); + vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)), 0, + (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)); +#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) +} + +#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \ + ({ \ + comm_fact.s0 = d1 + d2; \ + comm_fact.s1 = d3 + d4; \ + comm_fact.s2 = d5 + d6; \ + \ + col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0; \ + col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \ + \ + comm_fact.s0 = d1 - d2; \ + comm_fact.s1 = d3 - d4; \ + comm_fact.s2 = d5 - d6; \ + \ + col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \ + col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7; \ + }) + +/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor.
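Supported data types: F32/F16
+ *
+ * A minimal host-side sketch of the compile-time configuration described in the notes above, assuming a cl_program `program` and a cl_device_id `device` created elsewhere (both names are illustrative):
+ *
+ *   const char *opts = "-DDATA_TYPE=float -DVEC_SIZE=4 -DNUM_TILES_X=16 -DOUTPUT_TILE_W=4 -DOUTPUT_TILE_H=4";
+ *   cl_int err = clBuildProgram(program, 1, &device, opts, NULL, NULL); // selects this 4x4/5x5 variant at build time
+ *
+ * @param[in] src_ptr Pointer to the source tensor.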
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_4x4_5x5_nchw( + TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst) +#if defined(HAS_BIAS) + , + VECTOR_DECLARATION(bias) +#endif // defined(HAS_BIAS) +) +{ + // Each thread stores a 4x4/4x1 or 1x4 tile +#if defined(SRC_DEPTH) + Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH); + const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0); +#else /* defined(SRC_DEPTH) */ + + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0); +#endif /* defined(SRC_DEPTH) */ + + // Compute output address + int y_in = get_global_id(1); + int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W; + int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H; + int z_out = get_global_id(0); +#if defined(SRC_DEPTH) + int batch = get_global_id(2) / SRC_DEPTH; +#endif /* defined(SRC_DEPTH) */ + +#if defined(SRC_DEPTH) + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w; +#else /* defined(SRC_DEPTH) */ + + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z; +#endif /* defined(SRC_DEPTH) */ + + // Load the values across the channels to compose the input tile + DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z)); + DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z)); + DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z)); + DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z)); + DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z)); + 
DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z)); + DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z)); + DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z)); + +#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + // Compute out00, out01, out02 and out03 + float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06; + float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06; + float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06; + float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07; + +#if defined(HAS_BIAS) + // Add bias + Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias); + + float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out))); + + out00 += (DATA_TYPE)b; + out01 += (DATA_TYPE)b; + out02 += (DATA_TYPE)b; + out03 += (DATA_TYPE)b; +#endif // defined(HAS_BIAS) + + // Store the output tile +#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + VEC_DATA_TYPE(DATA_TYPE, 4) + out0_dt = CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), A_VAL, + B_VAL), + VEC_DATA_TYPE(DATA_TYPE, 4)); + *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0; + *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1; + *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2; + *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3; +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)), + 0, (__global DATA_TYPE *)(dst_addr)); +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + + DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z)); + DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z)); + DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z)); + DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z)); + DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z)); + DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z)); + DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z)); + DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z)); + + DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z)); + DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z)); + DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z)); + DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z)); + DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z)); + DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z)); + DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z)); + DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z)); + + DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z)); + DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z)); + DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z)); + DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z)); + DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z)); + DATA_TYPE d35 = *((__global 
DATA_TYPE *)(src_addr + 29 * src_stride_z)); + DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z)); + DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z)); + + DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z)); + DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z)); + DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z)); + DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z)); + DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z)); + DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z)); + DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z)); + DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z)); + + DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z)); + DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 41 * src_stride_z)); + DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z)); + DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z)); + DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z)); + DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z)); + DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z)); + DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z)); + + DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z)); + DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z)); + DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z)); + DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z)); + DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z)); + DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z)); + DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z)); + DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z)); + + DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z)); + DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z)); + DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z)); + DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z)); + DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z)); + DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z)); + DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z)); + DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z)); + + // Compute the 8x4 intermediate tensor + VEC_DATA_TYPE(float, 4) + comm_fact0, comm_fact1, comm_fact2; + VEC_DATA_TYPE(float, 4) + tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7; + + COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0); + COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0); + COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0); + COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0); + COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0); + COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0); + COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0); + COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0); + + // Compute the 4x4 output tile + comm_fact0 = tmp_col1 + tmp_col2; + comm_fact1 = 
tmp_col3 + tmp_col4; + comm_fact2 = tmp_col5 + tmp_col6; + + VEC_DATA_TYPE(float, 4) + out_col0 = comm_fact0 + comm_fact1 + (float)8.f * comm_fact2 + tmp_col0; + VEC_DATA_TYPE(float, 4) + out_col2 = comm_fact0 + (float)4.f * comm_fact1 + (float)2.f * comm_fact2; + + comm_fact0 = tmp_col1 - tmp_col2; + comm_fact1 = tmp_col3 - tmp_col4; + comm_fact2 = tmp_col5 - tmp_col6; + + VEC_DATA_TYPE(float, 4) + out_col1 = comm_fact0 + (float)2.f * comm_fact1 + (float)4.f * comm_fact2; + VEC_DATA_TYPE(float, 4) + out_col3 = comm_fact0 + (float)8.f * comm_fact1 + comm_fact2 + tmp_col7; + +#if defined(HAS_BIAS) + // Add bias + Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias); + + float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out))); + + out_col0 += (VEC_DATA_TYPE(float, 4))b; + out_col1 += (VEC_DATA_TYPE(float, 4))b; + out_col2 += (VEC_DATA_TYPE(float, 4))b; + out_col3 += (VEC_DATA_TYPE(float, 4))b; +#endif // defined(HAS_BIAS) + + // Store the output tile + vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), A_VAL, B_VAL), + VEC_DATA_TYPE(DATA_TYPE, 4)), + 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)); + vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), A_VAL, B_VAL), + VEC_DATA_TYPE(DATA_TYPE, 4)), + 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)); + vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), A_VAL, B_VAL), + VEC_DATA_TYPE(DATA_TYPE, 4)), + 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)); + vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), A_VAL, B_VAL), + VEC_DATA_TYPE(DATA_TYPE, 4)), + 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)); +#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) +} +#endif // defined(VEC_SIZE) && VEC_SIZE == 4 + +#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) +#if defined(VEC_SIZE) && VEC_SIZE == 2 +/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1 + * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
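Supported data types: F32/F16
+ *
+ * For example, an illustrative option set for this variant (NUM_TILES_X depends on the actual output shape):
+ *
+ *   -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL -DVEC_SIZE=2 -DDATA_TYPE=half -DNUM_TILES_X=16 -DOUTPUT_TILE_W=2 -DOUTPUT_TILE_H=1
+ *
+ * @param[in] src_ptr Pointer to the source tensor.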
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_2x1_3x1_nchw( + TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst) +#if defined(HAS_BIAS) + , + VECTOR_DECLARATION(bias) +#endif // defined(HAS_BIAS) +) +{ + winograd_output_transform_2x2_3x3_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes +#if defined(HAS_BIAS) + , + bias_ptr, + bias_stride_x, + bias_step_x, + bias_offset_first_element_in_bytes +#endif // defined(HAS_BIAS) + ); +} + +#endif // defined(VEC_SIZE) && VEC_SIZE == 2 + +#if defined(VEC_SIZE) && VEC_SIZE == 4 +/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1 + * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
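Supported data types: F32/F16
+ *
+ * Tile indexing sketch (illustrative values): with -DNUM_TILES_X=16, -DOUTPUT_TILE_W=4 and -DOUTPUT_TILE_H=1, the work-item with get_global_id(1) == 19 writes its output tile at
+ *
+ *   x_out = (19 % 16) * 4 = 12; // first output column of the tile
+ *   y_out = (19 / 16) * 1 = 1;  // output row of the tile
+ *
+ * @param[in] src_ptr Pointer to the source tensor.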
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_4x1_3x1_nchw( + TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst) +#if defined(HAS_BIAS) + , + VECTOR_DECLARATION(bias) +#endif // defined(HAS_BIAS) +) +{ + winograd_output_transform_4x4_3x3_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes +#if defined(HAS_BIAS) + , + bias_ptr, + bias_stride_x, + bias_step_x, + bias_offset_first_element_in_bytes +#endif // defined(HAS_BIAS) + ); +} + +/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1 + * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_4x1_5x1_nchw( + TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst) +#if defined(HAS_BIAS) + , + VECTOR_DECLARATION(bias) +#endif // defined(HAS_BIAS) +) +{ + winograd_output_transform_4x4_5x5_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes +#if defined(HAS_BIAS) + , + bias_ptr, + bias_stride_x, + bias_step_x, + bias_offset_first_element_in_bytes +#endif // defined(HAS_BIAS) + ); +} + +#endif // defined(VEC_SIZE) && VEC_SIZE == 4 +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) + +#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) +#if defined(VEC_SIZE) && VEC_SIZE == 2 +/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 + * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
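Supported data types: F32/F16
+ *
+ * For example, an illustrative option set for this vertical variant (NUM_TILES_X depends on the actual output shape):
+ *
+ *   -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL -DVEC_SIZE=2 -DDATA_TYPE=float -DNUM_TILES_X=16 -DOUTPUT_TILE_W=1 -DOUTPUT_TILE_H=2
+ *
+ * With WINOGRAD_OUTPUT_TRANSFORM_VERTICAL defined, the two results are stored one per row along dst_stride_y instead of with a single vstore2.
+ *
+ * @param[in] src_ptr Pointer to the source tensor.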
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_1x2_1x3_nchw( + TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst) +#if defined(HAS_BIAS) + , + VECTOR_DECLARATION(bias) +#endif // defined(HAS_BIAS) +) +{ + winograd_output_transform_2x2_3x3_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes +#if defined(HAS_BIAS) + , + bias_ptr, + bias_stride_x, + bias_step_x, + bias_offset_first_element_in_bytes +#endif // defined(HAS_BIAS) + ); +} + +#endif // defined(VEC_SIZE) && VEC_SIZE == 2 + +#if defined(VEC_SIZE) && VEC_SIZE == 4 +/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_1x4_1x3_nchw( + TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst) +#if defined(HAS_BIAS) + , + VECTOR_DECLARATION(bias) +#endif // defined(HAS_BIAS) +) +{ + winograd_output_transform_4x4_3x3_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes +#if defined(HAS_BIAS) + , + bias_ptr, + bias_stride_x, + bias_step_x, + bias_offset_first_element_in_bytes +#endif // defined(HAS_BIAS) + ); +} + +/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NCHW + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_1x4_1x5_nchw( + TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst) +#if defined(HAS_BIAS) + , + VECTOR_DECLARATION(bias) +#endif // defined(HAS_BIAS) +) +{ + winograd_output_transform_4x4_5x5_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes +#if defined(HAS_BIAS) + , + bias_ptr, + bias_stride_x, + bias_step_x, + bias_offset_first_element_in_bytes +#endif // defined(HAS_BIAS) + ); +} + +#endif // defined(VEC_SIZE) && VEC_SIZE == 4 +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) +#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H) diff --git a/src/core/CL/cl_kernels/nhwc/batch_to_space.cl b/src/core/CL/cl_kernels/nhwc/batch_to_space.cl new file mode 100644 index 0000000000..a5334525fe --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/batch_to_space.cl @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(BATCH_SIZE) +/** Batch to space transformation. (NHWC) + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float + * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: All + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[in] batch_id The input tensor batch id + * @param[in] block_shape_ptr Pointer to the block shape tensor. Supported data types: S32 + * @param[in] block_shape_stride_x Stride of the block shape tensor in X dimension (in bytes) + * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] block_shape_stride_y Stride of the block shape tensor in Y dimension (in bytes) + * @param[in] block_shape_step_y block_shape_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor + * @param[out] output_ptr Pointer to the destination tensor.
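+ *
+ * Index math sketch (illustrative values): with BATCH_SIZE = 4 and a 2x2 block shape, r = 4 / (2 * 2) = 1, so the work-item processing batch_id = 3 writes the element at (x, y) to
+ *
+ *   w     = 3 % 1 = 0;                        // destination batch
+ *   out_x = x * 2 + (3 / 1) % 2 = 2 * x + 1;  // destination column
+ *   out_y = y * 2 + (3 / 1) / 2 = 2 * y + 1;  // destination row
+ *
+ * @param[out] output_ptr Pointer to the destination tensor.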
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void batch_to_space_nhwc( + TENSOR3D_DECLARATION(input), + const int batch_id, + VECTOR_DECLARATION(block_shape), + TENSOR4D_DECLARATION(output)) +{ + Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); + Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape); + + const int block_x = *((__global int *)vector_offset(&block, 0)); + const int block_y = *((__global int *)vector_offset(&block, 1)); + + const int r = (BATCH_SIZE / (block_x * block_y)); + const int x = get_global_id(1); + const int y = get_global_id(2); + const int z = get_global_id(0); + const int w = batch_id % r; + + const int out_x = x * block_x + (batch_id / r) % block_x; + const int out_y = y * block_y + (batch_id / r) / block_x; + + *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, w)) = *((__global DATA_TYPE *)in.ptr); +} +#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) + +#if defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) +/** Batch to space transformation. (NHWC) + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float + * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2 + * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2 + * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: All + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[in] batch_id The input tensor batch id + * @param[out] output_ptr Pointer to the destination tensor.
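Supported data types: same as @p input_ptr
+ *
+ * For example, an illustrative compile-time configuration for a 2x2 block:
+ *
+ *   -DDATA_TYPE=float -DBATCH_SIZE=4 -DBLOCK_SHAPE_X=2 -DBLOCK_SHAPE_Y=2
+ *
+ * @param[out] output_ptr Pointer to the destination tensor.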
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void batch_to_space_static_nhwc( + TENSOR3D_DECLARATION(input), + const int batch_id, + TENSOR4D_DECLARATION(output)) +{ + Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); + + const int block_x = BLOCK_SHAPE_X; + const int block_y = BLOCK_SHAPE_Y; + + const int r = (BATCH_SIZE / (block_x * block_y)); + const int x = get_global_id(1); + const int y = get_global_id(2); + const int z = get_global_id(0); + const int w = batch_id % r; + + const int out_x = x * block_x + (batch_id / r) % block_x; + const int out_y = y * block_y + (batch_id / r) / block_x; + + *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, w)) = *((__global DATA_TYPE *)in.ptr); +} +#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl b/src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl new file mode 100644 index 0000000000..cb2da1bd99 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#define ADD_OP(a, b) ((a) + (b)) +#define SUB_OP(a, b) ((a) - (b)) +#define MUL_OP(a, b) ((a) * (b)) +#define INVSQRT_OP(a) rsqrt((a)) +#define SQCVT_SAT(a) (a) + +#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE) +#include "activation_float_helpers.h" + +/** Apply batch normalization on tensors with NHWC format. + * + * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. 
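-DACTIVATION_TYPE=relu
+ *
+ * For instance, a bounded ReLU might be requested with (illustrative values; the exact token set comes from activation_float_helpers.h):
+ *
+ *   -DACTIVATION_TYPE=brelu -DA_VAL=6.0f
+ *
+ * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g.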
-DACTIVATION_TYPE=relu + * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively + * + * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p input_ptr + * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) + * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor + * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p input_ptr + * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes) + * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor + * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p input_ptr + * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes) + * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor + * @param[in] gamma_ptr Pointer to the gamma source tensor. 
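Supported data types: same as @p input_ptr
+ *
+ * Per element, this kernel evaluates (a restatement of the ADD_OP/SUB_OP/MUL_OP/INVSQRT_OP chain in the body; gamma and beta default to 1 and 0 when USE_DEFAULT_GAMMA/USE_DEFAULT_BETA are defined):
+ *
+ *   x_bar = (x - mean) * rsqrt(var + epsilon);
+ *   out   = ACTIVATION(gamma * x_bar + beta);
+ *
+ * @param[in] gamma_ptr Pointer to the gamma source tensor.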
+ * @param[in]  gamma_stride_x                       Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in]  gamma_step_x                         gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  gamma_offset_first_element_in_bytes  The offset of the first element in the gamma source tensor
+ * @param[in]  epsilon                              Epsilon parameter in the batch normalization equation
+ */
+__kernel void batchnormalization_layer_nhwc(TENSOR3D_DECLARATION(input),
+#ifndef IN_PLACE
+                                            TENSOR3D_DECLARATION(output),
+#endif /* not IN_PLACE */
+                                            VECTOR_DECLARATION(mean),
+                                            VECTOR_DECLARATION(var),
+#ifndef USE_DEFAULT_BETA
+                                            VECTOR_DECLARATION(beta),
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+                                            VECTOR_DECLARATION(gamma),
+#endif /* USE_DEFAULT_GAMMA */
+                                            float epsilon)
+{
+    uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
+
+    __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
+#ifdef IN_PLACE
+    // Write back to the address that was read from; using the base input_ptr here would
+    // discard the per-workitem offsets and make every workitem store to the same location
+    __global uchar *output_addr = input_addr;
+#else  /* IN_PLACE */
+    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
+#endif /* IN_PLACE */
+    __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs;
+    __global uchar *var_addr  = var_ptr + var_offset_first_element_in_bytes + x_offs;
+#ifndef USE_DEFAULT_BETA
+    __global uchar *beta_addr = beta_ptr + beta_offset_first_element_in_bytes + x_offs;
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+    __global uchar *gamma_addr = gamma_ptr + gamma_offset_first_element_in_bytes + x_offs;
+#endif /* USE_DEFAULT_GAMMA */
+
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    data = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    denominator = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    numerator = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    x_bar = 0;
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    res0 = 0;
+
+    data        = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
+    denominator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)var_addr);
+    denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
+
+    // Calculate x bar and store results
+    numerator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr);
+    numerator = SUB_OP(data, numerator);
+    x_bar     = MUL_OP(numerator, denominator);
+
+#ifndef USE_DEFAULT_GAMMA
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    gamma_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)gamma_addr);
+
+    res0 = MUL_OP(gamma_vec, x_bar);
+#else  /* USE_DEFAULT_GAMMA */
+    // gamma is equal to 1, no need to perform multiplications
+    res0 = x_bar;
+#endif /* USE_DEFAULT_GAMMA */
+
+#ifndef USE_DEFAULT_BETA
+    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+    beta_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)beta_addr);
+    // beta is not zero, hence we need to perform the addition
+    res0 = ADD_OP(res0, beta_vec);
+#endif /* USE_DEFAULT_BETA */
+
+    res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res0, A_VAL, B_VAL);
+
+    STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE) */
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/channel_shuffle.cl b/src/core/CL/cl_kernels/nhwc/channel_shuffle.cl
new file mode 100644
index 0000000000..233beb3aa9
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/channel_shuffle.cl
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
+
+// Check valid VEC_SIZES
+#if VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
+#error "Only vector sizes 1, 2, 3, 4, 8 and 16 are supported"
+#endif // VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
+
+#define DIV_MOD_UINT(x, y, div_res, mod_res)                \
+    ({                                                      \
+        div_res = (uint)((x) * (float)(1.0f / (float)(y))); \
+        uint r  = div_res * (y);                            \
+        mod_res = (x)-r;                                    \
+    })
+
+#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_DIM_X)
+
+/** Performs channel shuffle when the data layout is NHWC. See https://arxiv.org/pdf/1707.01083.pdf for details.
+ *
+ * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
+ * @note The third dimension of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
+ * @note The first dimension of the tensor must be given as a preprocessor argument using -DSRC_DIM_X=num. e.g. -DSRC_DIM_X=64
+ * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
+ * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
+ *       K is equal to num_channels / num_groups.
+ * @note The leftover size in the X dimension should be given as a preprocessor argument using -DVEC_SIZE_LEFTOVER, i.e. x_dimension % VEC_SIZE. e.g. -DVEC_SIZE_LEFTOVER=1
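+ * @note Illustrative mapping with hypothetical values (for clarity only, not part of the
+ *       original documentation): with SRC_DIM_X=4 channels, -DNUM_GROUPS=2 and -DK=2, output
+ *       channels {0, 1, 2, 3} read input channels {0, 2, 1, 3}, following
+ *       in_channel = (out_channel * K) % SRC_DIM_X + (out_channel / NUM_GROUPS).
+ *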
+ * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: All
+ * @param[in]  src_stride_x                      Stride of the first source tensor in X dimension (in bytes)
+ * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  src_stride_y                      Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  src_stride_z                      Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  src_stride_w                      Stride of the first source tensor in W dimension (in bytes)
+ * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void channel_shuffle_nhwc(TENSOR4D_DECLARATION(src),
+                                   TENSOR4D_DECLARATION(dst))
+{
+    // Offset computation
+    const uint curr_out_channel = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER); // output feature map
+
+    uint z        = 0;
+    uint batch_id = 0;
+    // Compute curr_channel and batch_id
+    DIV_MOD_UINT(get_global_id(2), (uint)SRC_DIM_Z, batch_id, z);
+
+    VEC_DATA_TYPE(uint, VEC_SIZE)
+    curr_out_channels = (VEC_DATA_TYPE(uint, VEC_SIZE))(curr_out_channel) + VEC_OFFS(uint, VEC_SIZE);
+
+    VEC_DATA_TYPE(uint, VEC_SIZE)
+    in_channels = (curr_out_channels * (VEC_DATA_TYPE(uint, VEC_SIZE))(K)) % (VEC_DATA_TYPE(uint, VEC_SIZE))(SRC_DIM_X) + (curr_out_channels / (VEC_DATA_TYPE(uint, VEC_SIZE))(NUM_GROUPS));
+
+    // Load the values
+    const __global DATA_TYPE *input_ptr = (const __global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + z * src_stride_z + batch_id * src_stride_w);
+
+#if VEC_SIZE == 1
+    DATA_TYPE out0 = *(input_ptr + in_channels);
+#elif VEC_SIZE == 2
+    VEC_DATA_TYPE(DATA_TYPE, 2)
+    out0 =
+    {
+        *(input_ptr + in_channels.s0),
+        *(input_ptr + in_channels.s1)
+    };
+#elif VEC_SIZE == 3
+    VEC_DATA_TYPE(DATA_TYPE, 3)
+    out0 =
+    {
+        *(input_ptr + in_channels.s0),
+        *(input_ptr + in_channels.s1),
+        *(input_ptr + in_channels.s2)
+    };
+#elif VEC_SIZE == 4
+    VEC_DATA_TYPE(DATA_TYPE, 4)
+    out0 =
+    {
+        *(input_ptr + in_channels.s0),
+        *(input_ptr + in_channels.s1),
+        *(input_ptr + in_channels.s2),
+        *(input_ptr + in_channels.s3)
+    };
+#elif VEC_SIZE == 8
+    VEC_DATA_TYPE(DATA_TYPE, 8)
+    out0 =
+    {
+        *(input_ptr + in_channels.s0),
+        *(input_ptr + in_channels.s1),
+        *(input_ptr +
in_channels.s2), + *(input_ptr + in_channels.s3), + *(input_ptr + in_channels.s4), + *(input_ptr + in_channels.s5), + *(input_ptr + in_channels.s6), + *(input_ptr + in_channels.s7) + }; +#elif VEC_SIZE == 16 + VEC_DATA_TYPE(DATA_TYPE, 16) + out0 = + { + *(input_ptr + in_channels.s0), + *(input_ptr + in_channels.s1), + *(input_ptr + in_channels.s2), + *(input_ptr + in_channels.s3), + *(input_ptr + in_channels.s4), + *(input_ptr + in_channels.s5), + *(input_ptr + in_channels.s6), + *(input_ptr + in_channels.s7), + *(input_ptr + in_channels.s8), + *(input_ptr + in_channels.s9), + *(input_ptr + in_channels.sa), + *(input_ptr + in_channels.sb), + *(input_ptr + in_channels.sc), + *(input_ptr + in_channels.sd), + *(input_ptr + in_channels.se), + *(input_ptr + in_channels.sf) + }; +#endif // VEC_SIZE == 1 + + __global uchar *output_ptr = dst_ptr + curr_out_channel * sizeof(DATA_TYPE) + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w; + STORE_VECTOR_SELECT(out, DATA_TYPE, output_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); +} +#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_DIM_X) +#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/depth_to_space.cl b/src/core/CL/cl_kernels/nhwc/depth_to_space.cl new file mode 100644 index 0000000000..5464a4bef8 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/depth_to_space.cl @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE) +/** Depth to space transformation. (NHWC) + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float + * @note The input tensor depth size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2 + * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: All. 
+ * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[in]  batch_id                             The input tensor batch id
+ * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void depth_to_space_nhwc(
+    TENSOR3D_DECLARATION(input),
+    const int batch_id,
+    TENSOR4D_DECLARATION(output))
+{
+    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+    const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+    const int x = get_global_id(1);
+    const int y = get_global_id(2);
+    const int z = get_global_id(0) % r;
+
+    const int out_x = x * BLOCK_SHAPE + (get_global_id(0) / r) % BLOCK_SHAPE;
+    const int out_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE;
+
+    *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, batch_id)) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/dequantization_layer.cl b/src/core/CL/cl_kernels/nhwc/dequantization_layer.cl
new file mode 100644
index 0000000000..238d3a7921
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/dequantization_layer.cl
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
+/** This kernel performs per-channel dequantization of 8-bit signed integers to floating point. (NHWC)
+ *
+ * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
+ * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in]  input_ptr                             Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL
+ * @param[in]  input_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in]  input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in]  input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]  input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  input_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out] output_ptr                            Pointer to the destination tensor. Supported data types: F16/F32
+ * @param[in]  output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]  scale                                 Pointer to buffer with the per channel quantized scales
+ */
+__kernel void dequantization_layer_per_channel_nhwc(
+    TENSOR3D_DECLARATION(input),
+    TENSOR3D_DECLARATION(output),
+    __global float *scale)
+{
+    // Get pixels pointer
+    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if defined(LAST_ACCESSED_X)
+    // Check if access on width gets out of bounds
+    // If it does shift access vector to access elements within bounds
+    const int xi = (int)(get_global_id(0) * VEC_SIZE);
+    input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+    output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+    scale -= max(xi - (int)LAST_ACCESSED_X, 0);
+
+    // Load data
+    VEC_DATA_TYPE(int, VEC_SIZE)
+    val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
+
+    // Create scale vectors
+    const VEC_DATA_TYPE(float, VEC_SIZE)
+    vscale = VLOAD(VEC_SIZE)(0, &scale[xi]);
+
+    // Dequantize
+    VEC_DATA_TYPE(float, VEC_SIZE)
+    res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE));
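+    // Example with hypothetical values: a loaded quantized value of 64 with a per-channel
+    // scale of 0.5f dequantizes to 64 * 0.5f = 32.0f before the conversion to DATA_TYPE_DST.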
+    // Store result
+    VSTORE(VEC_SIZE)
+    (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
+#else  // !defined(LAST_ACCESSED_X)
+    *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(0)]);
+#endif // defined(LAST_ACCESSED_X)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/direct_convolution.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
new file mode 100644
index 0000000000..75a7a0f004
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "helpers_asymm.h"
+#include "tile_helpers.h"
+
+//! @cond Doxygen_Suppress
+/** OpenCL kernel to compute the direct convolution.
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16/QASYMM8/QASYMM8_SIGNED
+ * @note The accumulation data type must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The convolution strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2)
+ * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH and -DWEI_HEIGHT (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64)
+ * @note The channels of the source tensor must be passed at compile time using -DSRC_CHANNELS (e.g. -DSRC_CHANNELS=64)
+ * @note The channels of the destination tensor must be passed at compile time using -DDST_CHANNELS (e.g. -DDST_CHANNELS=64)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g.
-DSRC_TENSOR_TYPE=BUFFER) + * @note The tensor type ("BUFFER" or "IMAGE") of the weights tensor must be passed at compile time using -DWEI_TENSOR_TYPE (e.g. -DWEI_TENSOR_TYPE=BUFFER) + * @note The tensor type ("BUFFER" or "IMAGE") of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER) + * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float) + * @note The data type of the weights tensor must be passed at compile time using -DWEI_DATA_TYPE (e.g. -DWEI_DATA_TYPE=float) + * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float) + * @note The data type of the accumulators must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=float) + * @note The number of M0 rows (width*height) to process must be passed at compile time using -DM0 (e.g. -DM0=2) + * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2) + * @note The number of K0 inner accumulations must be passed at compile time using -DK0 (e.g. -DK0=2) + * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1) + * @note The zero value must be passed at compile time using -DZERO_VALUE (e.g. -DZERO_VALUE=0) + * @note Only the following configurations of M0, N0 and K0 are currently supported: + * - M0 = 1, 2, 3, 4, 5, .... n + * - N0 = 2, 3, 4, 8, 16 + * - K0 = 2, 3, 4, 8, 16 (only 4, 8 and 16 if WEI_TENSOR_TYPE=IMAGE) + * + *@note In case of QASYMM8/QASYMM8_SIGNED, the following extra information must be passed at compile time: + * - -DIS_QUANTIZED + * - The destination quantization multiplier e.g. -DDST_MULTIPLIER=1234 + * - The destination quantization shift e.g. -DDST_SHIFT=4 + * - The destination offset e.g. -DDST_OFFSET=4 + * - The source offset e.g. -DSRC_OFFSET=4 + * - The weights offset e.g. -DWEI_OFFSET=4 + * - The quantized zero value e.g. -DZERO_VALUE=4 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data type: F16/F32/QASYMM8 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data type: same as @p src_ptr
+ * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in]  wei_ptr                           Pointer to the weights tensor. Supported data type: same as @p src_ptr
+ * @param[in]  wei_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in]  wei_step_x                        wei_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  wei_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in]  wei_step_y                        wei_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]  wei_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in]  wei_step_z                        wei_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]  wei_stride_w                      Stride of the weights tensor in W dimension (in bytes)
+ * @param[in]  wei_step_w                        wei_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in]  wei_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in]  bia_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p src_ptr (if F32/F16) or S32 (if QASYMM8/QASYMM8_SIGNED)
+ * @param[in]  bia_stride_x                      (Optional) Stride of the bias tensor in X dimension (in bytes)
+ * @param[in]  bia_step_x                        (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]  bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ */
+//! @endcond
+__kernel void direct_convolution_nhwc(
+    TENSOR4D(src, SRC_TENSOR_TYPE),
+    TENSOR4D(dst, DST_TENSOR_TYPE),
+    TENSOR4D(wei, WEI_TENSOR_TYPE)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(bia)
+#endif // defined(HAS_BIAS)
+)
+{
+    // All the tensor dimensions are passed at compile time.
+    // In case of dynamic tensor support, the following dimensions should be passed as function argument.
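+    // An illustrative tile configuration (hypothetical values, not mandated by this patch):
+    // with -DM0=4, -DN0=4 and -DK0=4, each work-item computes a 4x4 output tile
+    // (4 spatial positions x 4 output channels), accumulating 4 input channels per
+    // iteration of the inner loop below.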
+#define _IWEI_WIDTH WEI_WIDTH +#define _IWEI_HEIGHT WEI_HEIGHT +#define _ISRC_WIDTH SRC_WIDTH +#define _ISRC_HEIGHT SRC_HEIGHT +#define _ISRC_CHANNELS SRC_CHANNELS +#define _IDST_WIDTH DST_WIDTH +#define _IDST_HEIGHT DST_HEIGHT +#define _IDST_CHANNELS DST_CHANNELS +#define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT) + + // If quantized, the output tile has to be quantized first before being stored to global memory +#if defined(IS_QUANTIZED) +#define _IOUTPUT_TILE cq +#else // defined(IS_QUANTIZED) +#define _IOUTPUT_TILE c +#endif // defined(IS_QUANTIZED) + + const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM + const int mout = GET_SPATIAL_IDX(1, M0, 0); // WIDTH x HEIGHT + const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX + + // .v = access the whole vector (OpenCL vector) + // .s[x] = access the vector element at position x (scalar access) + TILE(int, M0, 1, xi); + TILE(int, M0, 1, yi); + + // Convert the linear index to coordinate + LOOP_UNROLLING(int, i, 0, 1, M0, + { + xi[i].v = ((mout + i) % _IDST_WIDTH) * STRIDE_X; + yi[i].v = ((mout + i) / _IDST_WIDTH) * STRIDE_Y; + xi[i].v -= PAD_LEFT; + yi[i].v -= PAD_TOP; + }) + + // Initialize the accumulators + TILE(ACC_DATA_TYPE, M0, N0, c); + + LOOP_UNROLLING(int, i, 0, 1, M0, + { + c[i].v = 0; + }) + + for(int i = 0; i < (_IWEI_WIDTH * _IWEI_HEIGHT); ++i) + { + int ck = 0; + int xk = i % _IWEI_WIDTH; + int yk = i / _IWEI_WIDTH; + + int k = 0; + for(; k <= (_ISRC_CHANNELS - K0); k += K0) + { + TILE(SRC_DATA_TYPE, M0, K0, a); + TILE(WEI_DATA_TYPE, N0, K0, b); + + LOOP_UNROLLING(int, i, 0, 1, M0, + { + a[i].v = ZERO_VALUE; + }) + + // Load tile from the src tensor + T_LOAD_NHWC_INDIRECT(SRC_DATA_TYPE, M0, K0, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, xi, yi, a); + + // Load tile from the weights tensor + T_LOAD(WEI_DATA_TYPE, N0, K0, WEI_TENSOR_TYPE, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b); + + // Compute the matrix multiplication between two tiles + T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a, b, c); + + // Apply the offset correction (correction usually needed for asymmetric quantized computation) + // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero + T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, a, b, c); + + ck += K0; + } + + // We voluntarily use SRC_CHANNELS rather than _DSRC_CHANNELS + // This #if directive should be removed in case of dynamic tensor support +#if((SRC_CHANNELS % K0) != 0) + // Left-over accumulations + for(; k < _ISRC_CHANNELS; ++k) + { + TILE(SRC_DATA_TYPE, M0, 1, a); + TILE(WEI_DATA_TYPE, N0, 1, b); + + LOOP_UNROLLING(int, i, 0, 1, M0, + { + a[i].v = ZERO_VALUE; + }) + + // Load tile from the src tensor + T_LOAD_NHWC_INDIRECT(SRC_DATA_TYPE, M0, 1, SRC_TENSOR_TYPE, src, bout, yk, xk, ck, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, xi, yi, a); + + // Load tile from the weights tensor + // The T_LOAD for the left-over elements can only use BUFFER because we load one element per iteration + T_LOAD(WEI_DATA_TYPE, N0, 1, BUFFER, wei, ck, cout * _IY_MULTIPLIER + i, _IY_MULTIPLIER, wei_stride_y, b); + + // Compute the matrix multiplication between two tiles + T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a, b, c); + + // Apply the offset correction (operation usually needed for asymmetric quantized computation) + // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero + T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, 1, 
SRC_OFFSET, WEI_OFFSET, a, b, c); + + ++ck; + } +#endif // ((SRC_CHANNELS % K0) != 0) + } + + // Offset correction required for the quantized asymmetric computation + // The computation is not performed if both SRC_OFFSET and WEI_OFFSET are zero + T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * _ISRC_CHANNELS * SRC_OFFSET * WEI_OFFSET), c); + +#if defined(HAS_BIAS) + TILE(BIA_DATA_TYPE, 1, N0, bias0); + + T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 1, 0, bias0); + + // c = c + bias[broadcasted] + T_ADD_BROADCAST_X(ACC_DATA_TYPE, M0, N0, c, bias0, c); + +#endif // HAS_BIAS + + TILE(uint, M0, 1, dst_indirect_y); + + // Calculate the destination indirect Y + LOOP_UNROLLING(int, i, 0, 1, M0, + { + dst_indirect_y[i].v = (uint)min(mout + i, (int)(_IDST_WIDTH * _IDST_HEIGHT) - 1); + dst_indirect_y[i].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT); + }) + + bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; + +#if defined(IS_QUANTIZED) + + TILE(DST_DATA_TYPE, M0, N0, cq); + + // Quantize the tile + T_QUANTIZE8_ASYMMETRIC(ACC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq); +#endif // defined(IS_QUANTIZED) + + // Apply activation + T_ACTIVATION(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, _IOUTPUT_TILE, _IOUTPUT_TILE); + + // _IOUTPUT_TILE: c = fp32/fp16, cq=qasymm8 + // Store the tile in reverse order so the invalid values are overwritten with the valid ones + T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, M0, N0, PARTIAL_N0, DST_TENSOR_TYPE, dst, cout, dst_stride_y, x_cond, _IOUTPUT_TILE, dst_indirect_y); + +#undef _IWEI_WIDTH +#undef _IWEI_HEIGHT +#undef _ISRC_WIDTH +#undef _ISRC_HEIGHT +#undef _ISRC_CHANNELS +#undef _IDST_WIDTH +#undef _IDST_HEIGHT +#undef _IDST_CHANNELS +#undef _IY_MULTIPLIER +} \ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl new file mode 100644 index 0000000000..d2e7e45ada --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "helpers_asymm.h"
+#include "tile_helpers.h"
+
+#if defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_WIDTH) && defined(DST_HEIGHT) && defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
+//! @cond Doxygen_Suppress
+/** OpenCL kernel to compute the depthwise convolution for floating-point data types (F32/F16)
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The accumulation data type must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=half)
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The convolution strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2)
+ * @note The convolution dilations must be passed at compile time using -DDILATION_X and -DDILATION_Y (e.g. -DDILATION_X=2, -DDILATION_Y=2)
+ * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH and -DWEI_HEIGHT (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64)
+ * @note The channels of the source tensor must be passed at compile time using -DSRC_CHANNELS (e.g. -DSRC_CHANNELS=64)
+ * @note The channels of the destination tensor must be passed at compile time using -DDST_CHANNELS (e.g. -DDST_CHANNELS=64)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the weights tensor must be passed at compile time using -DWEI_TENSOR_TYPE (e.g. -DWEI_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=float)
+ * @note The data type of the weights tensor must be passed at compile time using -DWEI_DATA_TYPE (e.g. -DWEI_DATA_TYPE=float)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=float)
+ * @note The data type of the accumulators must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=float)
+ * @note The number of M0 rows (width) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The size of the partial store block in the first dimension must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1)
+ * @note Only the following configurations of M0 and N0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 5, .... n (M0 != 1 with STRIDE_X == 1 && DILATION_X == 1 only)
+ *  - N0 = 2, 3, 4, 8, 16 (only 4, 8 and 16 if WEI_TENSOR_TYPE=IMAGE)
+ * @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1)
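+ * @note Example with hypothetical values: for -DWEI_WIDTH=3 and -DM0=2, the tile reads
+ *       -DM0_A=4 rows of the source tensor, since M0_A = WEI_WIDTH + (M0 - 1) = 3 + 1 = 4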
+ *
+ * @param[in] src_ptr                           Pointer to the source tensor. Supported data type: F16/F32
+ * @param[in] src_stride_x                      Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr                          Pointer to the destination tensor. Supported data type: same as @p src_ptr
+ * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] wei_ptr                           Pointer to the weights tensor. Supported data type: same as @p src_ptr
+ * @param[in] wei_stride_x                      Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] wei_step_x                        wei_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] wei_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] wei_step_y                        wei_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] wei_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] wei_step_z                        wei_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] wei_stride_w                      Stride of the weights tensor in W dimension (in bytes)
+ * @param[in] wei_step_w                        wei_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] bia_ptr                           (Optional) Pointer to the bias tensor. Supported data type: same as @p src_ptr (if F32/F16) or S32 (if QASYMM8/QASYMM8_SIGNED)
+ * @param[in] bia_stride_x                      (Optional) Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] bia_step_x                        (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ */
+//! @endcond
+__kernel void dwc_native_fp_nhwc(
+    TENSOR4D(src, SRC_TENSOR_TYPE),
+    TENSOR4D(dst, DST_TENSOR_TYPE),
+    TENSOR4D(wei, WEI_TENSOR_TYPE)
+#if defined(HAS_BIAS)
+    ,
+    VECTOR_DECLARATION(bia)
+#endif // defined(HAS_BIAS)
+)
+{
+    // All the tensor dimensions are passed at compile time.
+    // In case of dynamic tensor support, the following dimensions should be passed as function argument.
+#define _IWEI_WIDTH WEI_WIDTH
+#define _IWEI_HEIGHT WEI_HEIGHT
+#define _ISRC_WIDTH SRC_WIDTH
+#define _ISRC_HEIGHT SRC_HEIGHT
+#define _IDST_WIDTH DST_WIDTH
+#define _IDST_HEIGHT DST_HEIGHT
+#define _IDST_CHANNELS DST_CHANNELS
+#define _IM0_A M0_A        // _IWEI_WIDTH + (M0 - 1) Rows tile A (if M0 != 1, the tiles overlap by one element on the X dimension)
+#define _IN0_A N0          // Cols tile A
+#define _IM0_B _IWEI_WIDTH // Rows tile B
+#define _IN0_B N0          // Cols tile B
+#define _IBOUNDARY_CHECK (!((WEI_WIDTH == 1 && WEI_HEIGHT == 1 && PAD_LEFT == 0 && PAD_TOP == 0 && M0 == 1)))
+
+    const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM
+    const int xo   = GET_SPATIAL_IDX(1, M0, 0);          // WIDTH
+#if defined(BATCHED_EXECUTION)
+    const int yo   = GET_SPATIAL_IDX(2, 1, 0) % _IDST_HEIGHT; // HEIGHT
+    const int bout = GET_SPATIAL_IDX(2, 1, 0) / _IDST_HEIGHT; // BATCH SIZE IDX
+#else  // defined(BATCHED_EXECUTION)
+    const int yo   = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
+    const int bout = 0;                        // BATCH SIZE IDX
+#endif // defined(BATCHED_EXECUTION)
+
+    int xi = xo * STRIDE_X;
+    int yi = yo * STRIDE_Y;
+    xi -= PAD_LEFT;
+    yi -= PAD_TOP;
+
+    int d = 0;
+#if DEPTH_MULTIPLIER != 1
+    for(; d < DEPTH_MULTIPLIER; d++)
+#endif // DEPTH_MULTIPLIER != 1
+    {
+        TILE(ACC_DATA_TYPE, M0, N0, c);
+
+        // Reset accumulators
+        LOOP_UNROLLING(int, i, 0, 1, M0,
+        {
+            c[i].v = 0;
+        })
+
+#if _IWEI_HEIGHT <= 5
+        LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
+#else  // _IWEI_HEIGHT <= 5
+        for(int yk = 0; yk < _IWEI_HEIGHT; yk++)
+#endif // _IWEI_HEIGHT <= 5
+        {
+            TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
+
+            LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
+            {
+                a[i].v = 0;
+            })
+
+            // Load tile from the src tensor (TILE A)
+            T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout,
_ISRC_WIDTH, _ISRC_HEIGHT, DILATION_X, 1, _IBOUNDARY_CHECK, a); + + TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b); + + // Load tile from the weights tensor (TILE B) + T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, (cout * DEPTH_MULTIPLIER) + d, yk * _IM0_B, 1, wei_stride_y, b); + + // Optimized path for STRIDE_X == 1 + // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension + LOOP_UNROLLING(int, m0, 0, 1, M0, + { + LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH, + { + c[m0].v += a[xk + m0].v *b[xk].v; + }) + }) + } +#if _IWEI_HEIGHT <= 5 + ) +#endif // _IWEI_HEIGHT <= 5 + +#if defined(HAS_BIAS) + TILE(BIA_DATA_TYPE, 1, N0, bias0); + + T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, (cout * DEPTH_MULTIPLIER) + d, 0, 0, 0, bias0); + + // c = c + bias[broadcasted] + T_ADD_BROADCAST_X(ACC_DATA_TYPE, M0, N0, c, bias0, c); +#endif // HAS_BIAS + + T_ACTIVATION(ACC_DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, c, c); + + TILE(uint, M0, 1, dst_indirect_y); + + bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; + + if(x_cond) + { + LOOP_UNROLLING(int, m0, 0, 1, M0, + { + int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1); + VSTORE_PARTIAL(N0, PARTIAL_N0) + (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w)); + }) + } + else + { + LOOP_UNROLLING(int, m0, 0, 1, M0, + { + int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1); + VSTORE(N0) + (c[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w)); + }) + } + } +} +#endif // defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_WIDTH) && defined(DST_HEIGHT) && defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl new file mode 100644 index 0000000000..1bc58b6e26 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#define CALCULATE_WEIGHTS_OFFSET_CORRECTION(A_DATA_TYPE, B_DATA_TYPE) CALCULATE_WEIGHTS_OFFSET_CORRECTION_STR(A_DATA_TYPE, B_DATA_TYPE)
+#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_STR(A_DATA_TYPE, B_DATA_TYPE) CALCULATE_WEIGHTS_OFFSET_CORRECTION_##A_DATA_TYPE##_##B_DATA_TYPE
+#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_char_char (0)
+#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_uchar_uchar (0)
+#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_uchar_char (128)
+#define CALCULATE_WEIGHTS_OFFSET_CORRECTION_char_uchar (-128)
+
+#define T_LOAD_MULTIPLIERS_SHIFT_PER_TENSOR() \
+    ({})
+
+#define T_LOAD_MULTIPLIERS_SHIFT_PER_CHANNEL()                                                                             \
+    TILE(DST_MULTIPLIERS_DATA_TYPE, 1, N0, multipliers);                                                                   \
+    TILE(DST_SHIFTS_DATA_TYPE, 1, N0, shifts);                                                                             \
+    T_LOAD(DST_MULTIPLIERS_DATA_TYPE, 1, N0, BUFFER, dst_multipliers, cout * DEPTH_MULTIPLIER + d, 0, 0, 0, multipliers);  \
+    T_LOAD(DST_SHIFTS_DATA_TYPE, 1, N0, BUFFER, dst_shifts, cout * DEPTH_MULTIPLIER + d, 0, 0, 0, shifts);
+
+#define T_LOAD_MULTIPLIERS_SHIFT(QUANTIZATION_TYPE) T_LOAD_MULTIPLIERS_SHIFT_STR(QUANTIZATION_TYPE)
+#define T_LOAD_MULTIPLIERS_SHIFT_STR(QUANTIZATION_TYPE) T_LOAD_MULTIPLIERS_SHIFT_##QUANTIZATION_TYPE()
+
+#if defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_WIDTH) && defined(DST_HEIGHT) && defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP)
+//! @cond Doxygen_Suppress
+/** OpenCL kernel to compute the depthwise convolution for quantized data types
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: QSYMM8/QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The convolution strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y (e.g. -DSTRIDE_X=2, -DSTRIDE_Y=2)
+ * @note The convolution dilations must be passed at compile time using -DDILATION_X and -DDILATION_Y (e.g. -DDILATION_X=2, -DDILATION_Y=2)
+ * @note The spatial dimensions of the weights must be passed at compile time using -DWEI_WIDTH and -DWEI_HEIGHT (e.g. -DWEI_WIDTH=9, -DWEI_HEIGHT=9)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The spatial dimensions of the destination tensor must be passed at compile time using -DDST_WIDTH and -DDST_HEIGHT (e.g. -DDST_WIDTH=96, -DDST_HEIGHT=64)
+ * @note The channels of the source tensor must be passed at compile time using -DSRC_CHANNELS (e.g. -DSRC_CHANNELS=64)
+ * @note The channels of the destination tensor must be passed at compile time using -DDST_CHANNELS (e.g. -DDST_CHANNELS=64)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the source tensor must be passed at compile time using -DSRC_TENSOR_TYPE (e.g. -DSRC_TENSOR_TYPE=BUFFER)
+ * @note The tensor type ("BUFFER" or "IMAGE") of the weights tensor must be passed at compile time using -DWEI_TENSOR_TYPE (e.g. -DWEI_TENSOR_TYPE=BUFFER)
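+ * @note For clarity, an illustrative expansion derived from the kernel body (not part of the
+ *       original documentation): each accumulation computes
+ *       (a + SRC_OFFSET) * (b + WEI_OFFSET) = a * b + a * WEI_OFFSET + b * SRC_OFFSET + SRC_OFFSET * WEI_OFFSET,
+ *       which the dot-product path applies as two per-row reductions plus a constant term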
+ * @note The tensor type ("BUFFER" or "IMAGE") of the destination tensor must be passed at compile time using -DDST_TENSOR_TYPE (e.g. -DDST_TENSOR_TYPE=BUFFER)
+ * @note The data type of the source tensor must be passed at compile time using -DSRC_DATA_TYPE (e.g. -DSRC_DATA_TYPE=int8)
+ * @note The data type of the weights tensor must be passed at compile time using -DWEI_DATA_TYPE (e.g. -DWEI_DATA_TYPE=int8)
+ * @note The data type of the destination tensor must be passed at compile time using -DDST_DATA_TYPE (e.g. -DDST_DATA_TYPE=int8)
+ * @note The data type of the accumulators must be passed at compile time using -DACC_DATA_TYPE (e.g. -DACC_DATA_TYPE=int)
+ * @note The number of M0 rows (width) to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of N0 output channels to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note The size of the partial store block in the first dimension must be passed at compile time using -DPARTIAL_N0 (e.g. -DPARTIAL_N0=1)
+ * @note The activation type must be passed at compile time using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
+ * @note The A and B variables required by some activation functions must be passed at compile time using -DA_VAL= and -DB_VAL= respectively
+ * @note The quantization offset used for both the per-tensor and per-channel quantization must be passed at compile time using -DDST_OFFSET (e.g., -DDST_OFFSET=3)
+ * @note The quantization shift for the per-tensor quantization must be passed at compile time using -DDST_SHIFT (e.g., -DDST_SHIFT=1)
+ * @note The quantization multiplier for the per-tensor quantization must be passed at compile time using -DDST_MULTIPLIER (e.g., -DDST_MULTIPLIER=121432)
+ * @note Only the following configurations of M0 and N0 are currently supported:
+ *  - M0 = 1, 2, 3, 4, 5, .... n (M0 != 1 with STRIDE_X == 1 && DILATION_X == 1 only)
+ *  - N0 = 2, 3, 4, 8, 16
+ * @note The number of rows to read from the src tensor must be passed at compile time using -DM0_A (e.g., -DM0_A=3). M0_A must be equal to WEI_WIDTH + (M0 - 1)
+ *
+ * @param[in] src_ptr                                          Pointer to the source tensor. Supported data type: QSYMM8/QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
+ * @param[in] src_stride_x                                     Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x                                       src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y                                     Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y                                       src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z                                     Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z                                       src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w                                     Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w                                       src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes                The offset of the first element in the source tensor
+ * @param[out] dst_ptr                                         Pointer to the destination tensor.
Supported data type: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] wei_ptr Pointer to the weights tensor. Supported data type: same as @p src_ptr + * @param[in] wei_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] wei_step_x wei_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] wei_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] wei_step_y wei_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] wei_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] wei_step_z wei_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] wei_stride_w Stride of the weights tensor in W dimension (in bytes) + * @param[in] wei_step_w wei_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] wei_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] dst_multipliers_ptr Pointer to the destination multipliers tensor for the per-channel quantization. Supported data type: S32 + * @param[in] dst_multipliers_stride_x Stride of the destination multipliers tensor in X dimension (in bytes) + * @param[in] dst_multipliers_step_x dst_multipliers_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_multipliers_offset_first_element_in_bytes The offset of the first element in the destination multipliers tensor + * @param[in] dst_shifts_ptr Pointer to the destination shifts tensor for the per-channel quantization. Supported data type: S32 + * @param[in] dst_shifts_stride_x Stride of the destination shifts tensor in X dimension (in bytes) + * @param[in] dst_shifts_step_x dst_shifts_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_shifts_offset_first_element_in_bytes The offset of the first element in the destination shifts tensor + * @param[in] bia_ptr (Optional) Pointer to the bias tensor Supported data type: S32 + * @param[in] bia_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes) + * @param[in] bia_step_x (Optional) bia_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] bia_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor + */ +//! 
@endcond +__kernel void dwc_native_quantized_nhwc( + TENSOR4D(src, SRC_TENSOR_TYPE), + TENSOR4D(dst, DST_TENSOR_TYPE), + TENSOR4D(wei, WEI_TENSOR_TYPE), + VECTOR_DECLARATION(dst_multipliers), + VECTOR_DECLARATION(dst_shifts) +#if defined(HAS_BIAS) + , + VECTOR_DECLARATION(bia) +#endif // defined(HAS_BIAS) +) +{ + // All the tensor dimensions are passed at compile time. + // In case of dynamic tensor support, the following dimensions should be passed as function arguments. +#define _IWEI_WIDTH WEI_WIDTH +#define _IWEI_HEIGHT WEI_HEIGHT +#define _ISRC_WIDTH SRC_WIDTH +#define _ISRC_HEIGHT SRC_HEIGHT +#define _IDST_WIDTH DST_WIDTH +#define _IDST_HEIGHT DST_HEIGHT +#define _IDST_CHANNELS DST_CHANNELS +#define _IM0_A M0_A // _IWEI_WIDTH + (M0 - 1) Rows tile A (If M0 != 1, the tiles overlap by 1 element on the X dimension) +#define _IN0_A N0 // Cols tile A +#define _IM0_B _IWEI_WIDTH // Rows tile B +#define _IN0_B N0 // Cols tile B +#define _IBOUNDARY_CHECK (!((WEI_WIDTH == 1 && WEI_HEIGHT == 1 && PAD_LEFT == 0 && PAD_TOP == 0 && M0 == 1))) + + const int cout = GET_SPATIAL_IDX(0, N0, PARTIAL_N0); // OFM + const int xo = GET_SPATIAL_IDX(1, M0, 0); // WIDTH +#if defined(BATCHED_EXECUTION) + const int yo = GET_SPATIAL_IDX(2, 1, 0) % _IDST_HEIGHT; // HEIGHT + const int bout = GET_SPATIAL_IDX(2, 1, 0) / _IDST_HEIGHT; // BATCH SIZE IDX +#else // defined(BATCHED_EXECUTION) + const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT + const int bout = 0; // BATCH SIZE IDX +#endif // defined(BATCHED_EXECUTION) + + int xi = xo * STRIDE_X; + int yi = yo * STRIDE_Y; + xi -= PAD_LEFT; + yi -= PAD_TOP; + + int d = 0; +#if DEPTH_MULTIPLIER != 1 + for(; d < DEPTH_MULTIPLIER; d++) +#endif // DEPTH_MULTIPLIER != 1 + { + TILE(ACC_DATA_TYPE, M0, N0, c); + + // Reset accumulators + LOOP_UNROLLING(int, i, 0, 1, M0, + { + c[i].v = 0; + }) + +#if _IWEI_HEIGHT <= 5 + LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT, +#else // _IWEI_HEIGHT <= 5 + for(int yk = 0; yk < _IWEI_HEIGHT; yk++) +#endif // _IWEI_HEIGHT <= 5 + { + TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a); + + LOOP_UNROLLING(int, i, 0, 1, _IM0_A, + { + a[i].v = ZERO_VALUE; + }) + + // Load tile from the src tensor (TILE A) + T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, DILATION_X, 1, _IBOUNDARY_CHECK, a); + + TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b); + + // Load tile from the weights tensor (TILE B) + T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, cout * DEPTH_MULTIPLIER + d, yk * _IM0_B, 1, wei_stride_y, b); + + // Optimized path for STRIDE_X == 1 + // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension + LOOP_UNROLLING(int, m0, 0, 1, M0, + { + LOOP_UNROLLING(int, n0, 0, 1, N0, + { +#if _IWEI_WIDTH <= 16 +#define DOT_DATA_TYPE SRC_DATA_TYPE +#define WEI_OFFSET_CORRECTION (CALCULATE_WEIGHTS_OFFSET_CORRECTION(SRC_DATA_TYPE, WEI_DATA_TYPE)) + + // Optimized path for the dot instruction + TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, x0); + TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, y0); + ACC_DATA_TYPE offset_a = 0; + ACC_DATA_TYPE offset_b = 0; + + LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH, + { + x0[0].s[xk] = a[xk + m0].s[n0]; + y0[0].s[xk] = b[xk].s[n0] + (int)WEI_OFFSET_CORRECTION; + }) + DOT_PRODUCT_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, y0[0].v, c[m0].s[n0]); + REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, offset_a); + REDUCE_INTEGER8(DOT_DATA_TYPE,
DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, y0[0].v, offset_b); + c[m0].s[n0] += offset_a * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION) + offset_b * (ACC_DATA_TYPE)SRC_OFFSET; +#else // _IWEI_WIDTH <= 16 + LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH, + { + c[m0].s[n0] += ((ACC_DATA_TYPE)a[xk + m0].s[n0] + (ACC_DATA_TYPE)(SRC_OFFSET)) * ((ACC_DATA_TYPE)b[xk].s[n0] + (ACC_DATA_TYPE)(WEI_OFFSET)); + }) +#endif // _IWEI_WIDTH <= 16 + }) + }) + } +#if _IWEI_HEIGHT <= 5 + ) +#endif // _IWEI_HEIGHT <= 5 + +#if _IWEI_WIDTH <= 16 + T_ADD_CONSTANT(ACC_DATA_TYPE, M0, N0, c, (_IWEI_WIDTH * _IWEI_HEIGHT * SRC_OFFSET * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION)), c); +#endif // _IWEI_WIDTH <= 16 + +#if defined(HAS_BIAS) + TILE(BIA_DATA_TYPE, 1, N0, bias0); + + // Load bias + T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout * DEPTH_MULTIPLIER + d, 0, 0, 0, bias0); + + // c = c + bias[broadcasted] + T_ADD_BROADCAST_X(ACC_DATA_TYPE, M0, N0, c, bias0, c); +#endif // HAS_BIAS + + T_LOAD_MULTIPLIERS_SHIFT(QUANTIZATION_TYPE); + + // Quantize the tile + TILE(DST_DATA_TYPE, M0, N0, cq); + T_QUANTIZE8(ACC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, multipliers, shifts, cq); + + // Perform activation + T_ACTIVATION_QUANTIZED(DST_DATA_TYPE, M0, N0, ACTIVATION_TYPE, DST_OFFSET, A_VAL, B_VAL, cq, cq); + + bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0; + + if(x_cond) + { + LOOP_UNROLLING(int, m0, 0, 1, M0, + { + int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1); + VSTORE_PARTIAL(N0, PARTIAL_N0) + (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w)); + }) + } + else + { + LOOP_UNROLLING(int, m0, 0, 1, M0, + { + int xi_out = min(xo + M0 - 1 - m0, (int)(_IDST_WIDTH) - 1); + VSTORE(N0) + (cq[M0 - 1 - m0].v, 0, (__global DST_DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + (uint)((cout * DEPTH_MULTIPLIER) + d) * sizeof(DST_DATA_TYPE) + (uint)xi_out * dst_stride_y + (uint)yo * dst_stride_z + (uint)bout * dst_stride_w)); + }) + } + } +} +#endif // defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_WIDTH) && defined(DST_HEIGHT) && defined(WEI_WIDTH) && defined(WEI_HEIGHT) && defined(N0) && defined(M0) && defined(DILATION_X) && defined(DILATION_Y) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/im2col.cl b/src/core/CL/cl_kernels/nhwc/im2col.cl new file mode 100644 index 0000000000..ac00c11283 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/im2col.cl @@ -0,0 +1,532 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" +#if defined(DATA_TYPE) && defined(ELEMENT_SIZE) + +#if ELEMENT_SIZE == 1 +#define COND_DATA_TYPE char +#elif ELEMENT_SIZE == 2 +#define COND_DATA_TYPE short +#elif ELEMENT_SIZE == 4 +#define COND_DATA_TYPE int +#else // ELEMENT_SIZE +#error "Element size not supported" +#endif // ELEMENT_SIZE + +#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) + +#define VECTOR_N VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) +#define COND_N VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE) + +/** Store a 1x9 row or a 3x3 block in a boundary-aware manner to avoid paddings in the channel dimension + * @name IM2COL1X9_NHWC_STORE + * + * @note To use this macro for a 3x3 block, @p ROW has to be 0 + * + * @param[in] VECTOR_SIZE The non-boundary vector width of @p DATA. Supported: 1(scalar), 2, 3, 4, 8, 16 + * @param[in] BOUNDARY_VECTOR_SIZE The boundary vector width of @p DATA. Supported: 1-16, but has to be <= @p VECTOR_SIZE + * @param[in] DATA_TYPE Data type of @p DATA + * @param[in] SRC_DEPTH Input channel size / depth + * @param[in] DATA Value variable base name + * @param[in] ROW The row number to store.
Supported: 0-8 + * @param[in] OUTPUT_PTR Output pointer + * @{ + */ +#if defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE +#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ + const bool at_channel_boundary = get_global_id(0) == 0; \ + if(at_channel_boundary) \ + { \ + IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ + } \ + else \ + { \ + IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ + } +#else // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE +#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ + IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) +#endif // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE + +#define IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ + VSTORE(VECTOR_SIZE) \ + (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH); \ + VSTORE(VECTOR_SIZE) \ + (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH); \ + VSTORE(VECTOR_SIZE) \ + (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH); \ + VSTORE(VECTOR_SIZE) \ + (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH); \ + VSTORE(VECTOR_SIZE) \ + (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH); \ + VSTORE(VECTOR_SIZE) \ + (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH); \ + VSTORE(VECTOR_SIZE) \ + (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH); \ + VSTORE(VECTOR_SIZE) \ + (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH); \ + VSTORE(VECTOR_SIZE) \ + (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH); + +#define IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ + (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH); \ + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ + (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH); \ + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ + (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH); \ + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ + (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH); \ + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ + (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH); \ + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ + (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH); \ + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ + (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH); \ + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ + (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH); \ + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ + (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH); +/** @}*/ + +/** This kernel performs im2col when the kernel size is 3x3 and the data layout is NHWC + * + * @note This kernel computes 
VECTOR_SIZE elements + * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements + * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2 + * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1 + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 + * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 + * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1 + * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
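 + *
 + * @note Illustrative example (values are hypothetical; the library derives the real build options in CLIm2ColKernel.cpp):
 + *       for a 3x3 F32 im2col on a 34x34x64 NHWC input with unit strides/dilations and no padding, a consistent set of
 + *       build options would be
 + * @code
 + * -DDATA_TYPE=float -DELEMENT_SIZE=4 -DVECTOR_SIZE=4 -DBOUNDARY_VECTOR_SIZE=4
 + * -DSRC_WIDTH=34 -DSRC_HEIGHT=34 -DSRC_DEPTH=64 -DCONVOLVED_WIDTH=32
 + * -DKERNEL_WIDTH=3 -DKERNEL_HEIGHT=3 -DSTRIDE_X=1 -DSTRIDE_Y=1 -DDILATION_X=1 -DDILATION_Y=1
 + * -DPAD_LEFT=0 -DPAD_RIGHT=0 -DPAD_TOP=0 -DPAD_BOTTOM=0 -DPAD_VALUE=0.0
 + * @endcode
 + *       Here CONVOLVED_WIDTH = (34 - 3) / 1 + 1 = 32, and BOUNDARY_VECTOR_SIZE equals VECTOR_SIZE because 64 is a multiple of 4.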
+ */ +__kernel void im2col3x3_nhwc( + TENSOR3D_DECLARATION(src), + IMAGE_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) +{ + // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding + const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE; + const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0); + const int yo = get_global_id(1); + const int batch = get_global_id(2); // batch size + + // Calculate input indices + const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X; + const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y; + + // Get input and output address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w; + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w; + + int yi_coord = 0; + int3 offset = 0; + + // Clamp xi + int3 xi_offset = ((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT); +#if PAD_LEFT != 0 || PAD_RIGHT != 0 +#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) + xi_offset = CLAMP(xi_offset, (int3)0, (int3)(SRC_WIDTH - 1)); +#endif // PAD_LEFT != 0 || PAD_RIGHT != 0 + // Multiply by src_stride_y as the width (X) dimension here is the second (y) dimension in src NHWC tensor + xi_offset *= (int3)src_stride_y; + + // Out-of-bound condition for X + int3 x_cond = (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) < (int3)0) || (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) >= (int3)SRC_WIDTH); + + // yi == 0 + // Clamp yi + // yi_coord is casted to unsigned int in order to use just a min() operation + // A "-1" 32 bit signed variable converted to unsigned gives 4294967295 + // This is a trick so that the values loaded in the padding areas are always from the last row (SRC_HEIGHT - 1), + // because of the negative yi_coord wrap-around, but it gets overwritten by PAD_VALUE immediately as the wrap-around + // also causes y_cond (y padding condition) to be satisfied + yi_coord = yi - (int)PAD_TOP; + + // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0 +#if PAD_TOP != 0 || PAD_BOTTOM != 0 + yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); +#endif // PAD_TOP != 0 || PAD_BOTTOM != 0 + + // Compute offset + offset = xi_offset + (yi_coord * (int)src_stride_z); + + // Load input values + VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0)); + VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1)); + VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2)); + +#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 + // Replace invalid values with PAD_VALUE + int y_cond = (int)((uint)(yi - (int)PAD_TOP) >= (uint)(SRC_HEIGHT)); + values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0))); + values1 = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1))); + values2 = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2))); +#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 + + // yi == 1 + // Clamp yi_coord (it can be negative if PAD_TOP > 1) + yi_coord = yi - (int)PAD_TOP + 1 * DILATION_Y; + + // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0 +#if PAD_TOP != 0 || PAD_BOTTOM != 0 + yi_coord = 
min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); +#endif // PAD_TOP != 0 || PAD_BOTTOM != 0 + + // Compute offset + offset = xi_offset + (yi_coord * (int)src_stride_z); + + // Load input values + VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0)); + VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1)); + VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2)); + +#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 + // Replace invalid values with PAD_VALUE + y_cond = (int)((uint)(yi - (int)PAD_TOP + 1 * DILATION_Y) >= (uint)(SRC_HEIGHT)); + values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0))); + values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1))); + values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2))); +#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 + + // yi == 2 + // Clamp yi_coord + yi_coord = yi - (int)PAD_TOP + 2 * DILATION_Y; + + // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0 +#if PAD_TOP != 0 || PAD_BOTTOM != 0 + yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); +#endif // PAD_TOP != 0 || PAD_BOTTOM != 0 + + // Compute offset + offset = xi_offset + (yi_coord * (int)src_stride_z); + + // Load input values + VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0)); + VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1)); + VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2)); + +#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 + // Replace invalid values with PAD_VALUE + y_cond = (int)((uint)(yi - (int)PAD_TOP + 2 * DILATION_Y) >= (uint)(SRC_HEIGHT)); + values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0))); + values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1))); + values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2))); +#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 + + // Store in a boundary-aware way to avoid padding + IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, 0, output_ptr) + +#ifdef HAS_BIAS + // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is + // added at the end of the channel, while the boundary vec is at the beginning of the channel.
+ // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in + // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE + // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp + if((ch + VECTOR_SIZE) >= SRC_DEPTH) + { + *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 9) = 1.0f; + } +#endif // HAS_BIAS +} + +#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 +#define IM2COL1x9(i) \ + ({ \ + yi_coord = yi - (int)PAD_TOP + i * DILATION_Y; \ + yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); \ + \ + offset0 = xi_offset0 + (yi_coord * (int)src_stride_z); \ + offset1 = xi_offset1 + (yi_coord * (int)src_stride_z); \ + \ + VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \ + VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \ + VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \ + VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \ + VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \ + VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \ + VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \ + VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \ + VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1)); \ + \ + int y_cond = (int)((uint)(yi - (int)PAD_TOP + i * DILATION_Y) >= (uint)(SRC_HEIGHT)); \ + values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s0))); \ + values1 = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s1))); \ + values2 = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s2))); \ + values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s3))); \ + values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s4))); \ + values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s5))); \ + values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s6))); \ + values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s7))); \ + values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond1))); \ + \ + IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \ + }) +#else // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 +#define IM2COL1x9(i) \ + ({ \ + yi_coord = yi - (int)PAD_TOP + i * DILATION_Y; \ + yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); \ + \ + offset0 = xi_offset0 + (yi_coord * (int)src_stride_z); \ + offset1 = xi_offset1 + (yi_coord * (int)src_stride_z); \ + \ + VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \ + VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \ + VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \ + VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \ + VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global 
DATA_TYPE *)(input_ptr + offset0.s4)); \ + VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \ + VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \ + VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \ + VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1)); \ + \ + IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \ + }) +#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 + +/** This kernel performs im2col when the kernel size is 9x9 and the data layout is NHWC + * + * @note This kernel computes VECTOR_SIZE elements + * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements + * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2 + * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1 + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 + * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 + * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1 + * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
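 + *
 + * @note For reference (derived from the store logic of this kernel, not an extra build option): each row of the output
 + *       matrix holds KERNEL_WIDTH * KERNEL_HEIGHT * SRC_DEPTH elements, i.e. 81 * SRC_DEPTH for this 9x9 variant, and when
 + *       -DHAS_BIAS is defined a trailing 1.0f is appended per row, which is why the bias element is written at offset
 + *       SRC_DEPTH * 81 from the start of the row.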
+ */ +__kernel void im2col9x9_nhwc( + TENSOR3D_DECLARATION(src), + IMAGE_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) +{ + // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding + const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE; + const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0); + const int yo = get_global_id(1); + const int batch = get_global_id(2); // batch size + + // Calculate input indices + const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X; + const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y; + + // Get input and output address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w; + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w; + + int yi_coord = 0; + int8 offset0 = 0; + int offset1 = 0; + + // Clamp xi + int8 xi_offset0 = ((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT); + int xi_offset1 = ((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT); + +#if PAD_LEFT != 0 || PAD_RIGHT != 0 +#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) + xi_offset0 = CLAMP(xi_offset0, (int8)0, (int8)(SRC_WIDTH - 1)); + xi_offset1 = CLAMP(xi_offset1, (int)0, (int)(SRC_WIDTH - 1)); +#endif // PAD_LEFT != 0 || PAD_RIGHT != 0 + xi_offset0 *= (int8)src_stride_y; + xi_offset1 *= (int)src_stride_y; + + // Out-of-bound condition for X + int8 x_cond0 = (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) < (int8)0) || (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) >= (int8)SRC_WIDTH); + int x_cond1 = (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) < (int)0) || (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH); + + IM2COL1x9(0); + IM2COL1x9(1); + IM2COL1x9(2); + IM2COL1x9(3); + IM2COL1x9(4); + IM2COL1x9(5); + IM2COL1x9(6); + IM2COL1x9(7); + IM2COL1x9(8); + +#ifdef HAS_BIAS + // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is + // added at the end of the channel, while the boundary vec is at the beginning of the channel. + // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in + // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE + // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp + if((ch + VECTOR_SIZE) >= SRC_DEPTH) + { + *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 81) = 1.0f; + } +#endif // HAS_BIAS +} + +/** This opencl kernel performs a generic im2col implementation when the data layout is NHWC + * + * @note This kernel computes VECTOR_SIZE elements + * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements + * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2 + * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1 + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. 
-DSRC_WIDTH=128 and -DSRC_HEIGHT=128 + * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 + * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64 + * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2 + * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0 + * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 + * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1 + * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
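 + *
 + * @note A minimal host-side sketch of how such a set of options can be passed when building this kernel with the raw
 + *       OpenCL API (illustrative values only; in the library the real string is assembled in CLIm2ColKernel.cpp).
 + *       For a 5x5 kernel over a 16x16x3 NHWC input with 2-element padding on every side:
 + * @code
 + * const char *opts =
 + *     "-DDATA_TYPE=float -DELEMENT_SIZE=4 -DVECTOR_SIZE=2 -DBOUNDARY_VECTOR_SIZE=1 "
 + *     "-DSRC_WIDTH=16 -DSRC_HEIGHT=16 -DSRC_DEPTH=3 -DCONVOLVED_WIDTH=16 "
 + *     "-DKERNEL_WIDTH=5 -DKERNEL_HEIGHT=5 -DSTRIDE_X=1 -DSTRIDE_Y=1 -DDILATION_X=1 -DDILATION_Y=1 "
 + *     "-DPAD_LEFT=2 -DPAD_RIGHT=2 -DPAD_TOP=2 -DPAD_BOTTOM=2 -DPAD_VALUE=0.0";
 + * cl_int err = clBuildProgram(program, 1, &device, opts, NULL, NULL); // program and device created beforehand
 + * @endcode
 + *       BOUNDARY_VECTOR_SIZE is 1 here because SRC_DEPTH % VECTOR_SIZE = 3 % 2 = 1 (the leftover handled by the partial store).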
+ */ +__kernel void im2col_generic_nhwc( + TENSOR3D_DECLARATION(src), + IMAGE_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) +{ + // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding + const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE; + const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0); + const int yo = get_global_id(1); + const int batch = get_global_id(2); // batch size + + // Calculate input indices + const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X; + const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y; + + // Get input and output address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w; + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w; + + int i = 0; + for(int yk = 0; yk < KERNEL_HEIGHT; ++yk) + { + // Clamp yi_coord + int yi_coord = yi + yk * DILATION_Y - (int)PAD_TOP; + yi_coord = CLAMP(yi_coord, (int)0, (int)(SRC_HEIGHT - 1)); + + // Out-of-bound condition for Y + int y_border_condition = ((yi + yk * DILATION_Y - (int)PAD_TOP) < (int)0) || ((yi + yk * DILATION_Y - (int)PAD_TOP) >= (int)SRC_HEIGHT); + + for(int xk = 0; xk < KERNEL_WIDTH; ++xk) + { + // Clamp xi_coord + int xi_coord = (xi + xk * DILATION_X - (int)PAD_LEFT); + xi_coord = CLAMP(xi_coord, (int)0, (int)(SRC_WIDTH - 1)); + + // Out-of-bound condition for X + int x_border_condition = ((xi + xk * DILATION_X - (int)PAD_LEFT) < (int)0) || ((xi + xk * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH); + + int offset = xi_coord * (int)src_stride_y + (yi_coord * (int)src_stride_z); + + VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset)); + +#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 + // Replace with PAD_VALUE if the value is out-of-bound + values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)x_border_condition || (COND_N)(y_border_condition))); +#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 + + // Store in a boundary-aware way to avoid padding +#if BOUNDARY_VECTOR_SIZE != VECTOR_SIZE + const bool at_channel_boundary = get_global_id(0) == 0; + if(at_channel_boundary) + { + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) + (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH); + } + else // at_channel_boundary +#endif // BOUNDARY_VECTOR_SIZE != VECTOR_SIZE + { + VSTORE(VECTOR_SIZE) + (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH); + } + i++; + } + } + +#ifdef HAS_BIAS + // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is + // added at the end of the channel, while the boundary vec is at the beginning of the channel. 
+ // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in + // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE + // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp + if((ch + VECTOR_SIZE) >= SRC_DEPTH) + { + *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT) = 1.0f; + } +#endif // HAS_BIAS +} +#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) +#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/normalization_layer.cl b/src/core/CL/cl_kernels/nhwc/normalization_layer.cl new file mode 100644 index 0000000000..7e35e161c8 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/normalization_layer.cl @@ -0,0 +1,177 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" +#include "tile_helpers.h" + +#define MUL_OP(x, y) ((x) * (y)) +#define ADD_OP(x, y) ((x) + (y)) +#define DIV_OP(x, y) ((x) / (y)) +#define POW_OP(x, y) pow((x), (y)) +#define SQCVT_SAT(a) (a) + +#if defined(WIDTH_SIZE) +/** Apply cross-map normalization. + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16 + * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5 + * @note The size of the first dimension (the one being normalized) should be given as a preprocessor argument using -DWIDTH_SIZE=size. e.g. -DWIDTH_SIZE=192 + * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DBETA and -DKAPPA + * + * @param[in] input_ptr Pointer to the first source tensor.
Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void normalization_layer_cross_map_nhwc(TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + // Offset computation + const uint x_offs = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER); + + // Address computation + __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z; + __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z; + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + acc = 0; + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + coeff_v = SQCVT_SAT(COEFF); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + beta_v = SQCVT_SAT(BETA); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + kappa_v = SQCVT_SAT(KAPPA); + + const int left_slice = max((int)0, (int)x_offs - (int)RADIUS); + const int right_slice = min((int)WIDTH_SIZE - 1, (int)x_offs + (int)RADIUS); + + for(int i = left_slice; i <= right_slice; ++i) + { + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + i * sizeof(DATA_TYPE))); + acc = ADD_OP(acc, MUL_OP(values, values)); + } + + acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + normalized = POW_OP(acc, beta_v); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + normalized_pixel0 = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + x_offs * sizeof(DATA_TYPE))), normalized); + + STORE_VECTOR_SELECT(normalized_pixel, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); +} +#endif // defined(WIDTH_SIZE) + +#if defined(NUM_SLICES) && defined(DIM1_SIZE) +/** Apply in-map normalization when tensors are in the NHWC data layout format. + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. 
-DDATA_TYPE=short + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16 + * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5 + * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192 + * @note The size of the second dimension should be given as a preprocessor argument using -DDIM1_SIZE=size. e.g. -DDIM1_SIZE=64 + * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DBETA and -DKAPPA + * + * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void normalization_layer_in_map_nhwc(TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + // Offset computation + const uint x_offs = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER); + const int current_cols = get_global_id(1); + const int current_rows = get_global_id(2); + + // Address computation + __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE); + __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + current_cols * output_stride_y + current_rows * output_stride_z; + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + acc = 0; + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + coeff_v = SQCVT_SAT(COEFF); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + beta_v = SQCVT_SAT(BETA); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + kappa_v = SQCVT_SAT(KAPPA); + + const int first_col = max(0, current_cols - (int)RADIUS); + const int last_col = min((int)DIM1_SIZE - 1, current_cols + (int)RADIUS); + +#if defined(IN_MAP_2D) + const int first_row = max(0, current_rows - (int)RADIUS); + const int last_row = min((int)NUM_SLICES - 1, current_rows + (int)RADIUS); +#endif /* defined(IN_MAP_2D) */ + +#if defined(IN_MAP_2D) + for(int j = first_row; j <= last_row; ++j) + { +#else // defined(IN_MAP_2D) + const int j = current_rows; +#endif /* defined(IN_MAP_2D) */ + for(int i = first_col; i <= last_col; ++i) + { + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + i * input_stride_y + j * input_stride_z)); + acc = ADD_OP(acc, MUL_OP(values, values)); + } +#if defined(IN_MAP_2D) + } +#endif /* defined(IN_MAP_2D) */ + + acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + normalized = POW_OP(acc, beta_v); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + normalized_pixel0 = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + current_cols * output_stride_y + current_rows *output_stride_z)), normalized); + + STORE_VECTOR_SELECT(normalized_pixel, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); +} +#endif // defined(NUM_SLICES) && defined(DIM1_SIZE) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl new file mode 100644 index 0000000000..86c33499e2 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(VEC_SIZE) + +#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + +/** Apply normalize_planar_yuv layer on tensors with NHWC data layout. + * + * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE + * + * @param[in] src_ptr Pointer to the first source tensor. 
Supported data types: F16/F32 + * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr + * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) + * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor + * @param[in] std_ptr Pointer to the std tensor. 
Supported data types: same as @p src_ptr + * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes) + * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] std_offset_first_element_in_bytes The offset of the first element in the var source tensor + */ +__kernel void normalize_planar_yuv_layer_nhwc(TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + VECTOR_DECLARATION(mean), + VECTOR_DECLARATION(std)) +{ + uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0); + + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z; + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z; + __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs; + __global uchar *std_addr = std_ptr + std_offset_first_element_in_bytes + x_offs; + + const TYPE curr_mean = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr); + const TYPE curr_std = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)std_addr); + + TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr); + TYPE res0 = (data - curr_mean) / curr_std; + + STORE_VECTOR_SELECT(res, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); +} +#endif // defined(DATA_TYPE) && defined(VEC_SIZE) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl new file mode 100644 index 0000000000..7bc3c15a63 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE) + +#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) +#define OFFSET_FLT ((float)OFFSET) +#define SCALE_FLT ((float)SCALE) + +/** Apply normalize_planar_yuv layer on tensors with NHWC data layout. + * + * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. 
-DDATA_TYPE=float + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8 + * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8 + * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8 + * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE + * + * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED + * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr + * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) + * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor + * @param[in] std_ptr Pointer to the std tensor. 
Supported data types: same as @p src_ptr + * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes) + * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] std_offset_first_element_in_bytes The offset of the first element in the var source tensor + */ +__kernel void normalize_planar_yuv_layer_q8_nhwc(TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + VECTOR_DECLARATION(mean), + VECTOR_DECLARATION(std)) +{ + uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0); + + __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z; + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z; + __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs; + __global uchar *std_addr = std_ptr + std_offset_first_element_in_bytes + x_offs; + + VEC_DATA_TYPE(float, VEC_SIZE) + curr_mean_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr), VEC_DATA_TYPE(float, VEC_SIZE)); + curr_mean_flt = round(curr_mean_flt - OFFSET_FLT) * SCALE_FLT; + + VEC_DATA_TYPE(float, VEC_SIZE) + curr_std_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)std_addr), VEC_DATA_TYPE(float, VEC_SIZE)); + curr_std_flt = round(curr_std_flt - OFFSET_FLT) * SCALE_FLT; + + VEC_DATA_TYPE(float, VEC_SIZE) + data_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr), VEC_DATA_TYPE(float, VEC_SIZE)); + data_flt = round(data_flt - OFFSET_FLT) * (SCALE_FLT); + + // Perform normalization + VEC_DATA_TYPE(float, VEC_SIZE) + res_flt = (data_flt - curr_mean_flt) / curr_std_flt; + + const TYPE res0 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE); + STORE_VECTOR_SELECT(res, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); +} +#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/pooling_layer.cl b/src/core/CL/cl_kernels/nhwc/pooling_layer.cl new file mode 100644 index 0000000000..5b59ff5088 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/pooling_layer.cl @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "repeat.h"
+#include "tile_helpers.h"
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define POOL_OP(x, y) (fmax((x), (y)))
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) ((x) * (x))
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+#define SQRT_OP(x) sqrt((x))
+
+#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
+
+#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
+/** Performs pooling layer of size equal to MxN. This OpenCL kernel can perform the following pooling types:
+ * -# max, -DPOOL_MAX must be passed at compile time
+ * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time
+ * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
+ *
+ * @note Datatype must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
+ * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
+ * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
+ * @note Pool size must be passed at compile time using -DPOOL_SIZE_X and -DPOOL_SIZE_Y. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4
+ * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
+ * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
+ * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
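+// The options below are an illustrative sketch of how a host might compile this
+// kernel for a 3x3 average pool over an FP16 NHWC tensor; the concrete values
+// are assumptions made up for the example, not taken from this patch:
+//   -DDATA_TYPE=half -DACC_DATA_TYPE=half -DPOOL_AVG -DEXCLUDE_PADDING
+//   -DPOOL_SIZE_X=3 -DPOOL_SIZE_Y=3 -DSTRIDE_X=1 -DSTRIDE_Y=1 -DPAD_X=1 -DPAD_Y=1
+//   -DSRC_WIDTH=56 -DSRC_HEIGHT=56 -DDST_HEIGHT=56 -DDST_CHANNELS=64 -DDST_BATCH_SIZE=1
+//   -DVEC_SIZE=8 -DVEC_SIZE_LEFTOVER=0 -DINITIAL_VALUE=0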
+__kernel void pooling_layer_MxN_nhwc(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+{
+ // Note: If C is not a multiple of VEC_SIZE, we shift back by VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0
+ // Note: If C is less than VEC_SIZE, VEC_SIZE should be shrunk to the closest smaller VEC_SIZE.
This operation is performed on the host side + int idx_out_c = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER); + int idx_out_w = GET_SPATIAL_IDX(1, 1, 0); +#if DST_BATCH_SIZE != 1 + // If batch size != 1, the batch size dimension is collapsed over the height dimension + int idx_out_h = GET_SPATIAL_IDX(2, 1, 0) % DST_HEIGHT; + int idx_out_n = GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT; +#else //DST_BATCH_SIZE != 1 + int idx_out_h = GET_SPATIAL_IDX(2, 1, 0); + int idx_out_n = 0; +#endif // DST_BATCH_SIZE != 1 + + __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w; + + __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n * + output_stride_w; + + VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) + res0 = INITIAL_VALUE; + + int idx_in_w = idx_out_w * STRIDE_X - PAD_X; + int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y; + + int pool_x_s = max((int)0, -idx_in_w); + int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w); + int pool_y_s = max((int)0, -idx_in_h); + int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h); + +#if defined(EXCLUDE_PADDING) + int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s); +#else // defined(EXCLUDE_PADDING) + int filter_size = POOL_SIZE_X * POOL_SIZE_Y; +#endif // defined(EXCLUDE_PADDING) + +#if POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0 + // Global pooling path + for(int y = 0; y < POOL_SIZE_Y; ++y) + { +#pragma unroll 8 + for(int x = 0; x < POOL_SIZE_X; ++x) + { +#else // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0 + for(int y = pool_y_s; y < pool_y_e; ++y) + { +#pragma unroll 8 + for(int x = pool_x_s; x < pool_x_e; ++x) + { +#endif // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0 + VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) + data0; +#if defined(FP_MIXED_PRECISION) + // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE + data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); +#else // defined(FP_MIXED_PRECISION) + data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)); +#endif // defined(FP_MIXED_PRECISION) + +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data0 *= data0; +#endif // defined(POOL_L2) + res0 = POOL_OP(res0, data0); + } + } + +#if defined(POOL_AVG) || defined(POOL_L2) + res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size; +#endif // defined(POOL_AVG) || defined(POOL_L2) + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res0 = SQRT_OP(res0); +#endif // defined(POOL_L2) + + // Store result +#if defined(FP_MIXED_PRECISION) + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); + STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); +#else // defined(FP_MIXED_PRECISION) + STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); +#endif // defined(FP_MIXED_PRECISION) +} +#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) + +#define 
SELECT_TYPE SELECT_VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+
+/** Performs pooling layer of size equal to 2. This OpenCL kernel can perform the following pooling types:
+ * -# max, -DPOOL_MAX must be passed at compile time
+ * -# max extracting the max index, -DPOOL_MAX and -DEXTRACT_MAX_INDEX must be passed at compile time
+ * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time
+ * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
+ *
+ * @note Datatype must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
+ * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
+ * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
+ * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
+ * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
+ * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] indices_ptr (Optional) Pointer to the indices tensor. Supported data types: U32
+ * @param[in] indices_stride_x (Optional) Stride of the indices tensor in X dimension (in bytes)
+ * @param[in] indices_step_x (Optional) indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] indices_stride_y (Optional) Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in] indices_step_y (Optional) indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] indices_stride_z (Optional) Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in] indices_step_z (Optional) indices_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] indices_stride_w (Optional) Stride of the indices tensor in W dimension (in bytes)
+ * @param[in] indices_step_w (Optional) indices_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] indices_offset_first_element_in_bytes (Optional) The offset of the first element in the indices tensor
+ */
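+// Illustrative only: a host might build this kernel for a 2x2 max pool that also
+// extracts the index of the maximum (e.g. to feed a later max-unpooling stage);
+// every value below is an assumption made up for the example:
+//   -DDATA_TYPE=float -DACC_DATA_TYPE=float -DPOOL_MAX -DEXTRACT_MAX_INDEX
+//   -DSTRIDE_X=2 -DSTRIDE_Y=2 -DPAD_X=0 -DPAD_Y=0
+//   -DSRC_WIDTH=112 -DSRC_HEIGHT=112 -DDST_HEIGHT=56 -DDST_CHANNELS=64 -DDST_BATCH_SIZE=1
+//   -DVEC_SIZE=4 -DVEC_SIZE_LEFTOVER=0 -DINITIAL_VALUE=-FLT_MAX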
+__kernel void pooling_layer_2x2_nhwc(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output)
+#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
+ ,
+ TENSOR4D_DECLARATION(indices)
+#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
+)
+{
+ // Note: If C is not a multiple of VEC_SIZE, we shift back by VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0
+ // Note: If C is less than VEC_SIZE, VEC_SIZE should be shrunk to the closest smaller VEC_SIZE.
This operation is performed on the host side
+ int idx_out_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+ int idx_out_w = get_global_id(1);
+#if DST_BATCH_SIZE != 1
+ // If batch size != 1, the batch size dimension is collapsed over the height dimension
+ int idx_out_h = get_global_id(2) % DST_HEIGHT;
+ int idx_out_n = get_global_id(2) / DST_HEIGHT;
+#else //DST_BATCH_SIZE != 1
+ int idx_out_h = get_global_id(2);
+ int idx_out_n = 0;
+#endif // DST_BATCH_SIZE != 1
+
+ int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
+ int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
+
+ __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w;
+
+ __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n *
+ output_stride_w;
+
+ int pool_x_s = max((int)0, -idx_in_w);
+ int pool_x_e = min((int)2, (int)SRC_WIDTH - idx_in_w);
+ int pool_y_s = max((int)0, -idx_in_h);
+ int pool_y_e = min((int)2, (int)SRC_HEIGHT - idx_in_h);
+
+ int filter_size = (pool_x_e - pool_x_s) * (pool_y_e - pool_y_s);
+
+ int x0 = pool_x_s + idx_in_w;
+ int y0 = pool_y_s + idx_in_h;
+ int x1 = pool_x_e - 1 + idx_in_w;
+ int y1 = pool_y_e - 1 + idx_in_h;
+
+ REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE), data, 0);
+
+#if defined(FP_MIXED_PRECISION)
+ // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
+ data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+ data1 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+ data2 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+ data3 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+#else // defined(FP_MIXED_PRECISION)
+ data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z));
+ data1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z));
+ data2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z));
+ data3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z));
+#endif // defined(FP_MIXED_PRECISION)
+
+#if !defined(POOL_MAX)
+ if(filter_size != 4)
+ {
+ SELECT_TYPE cond_w_s = (SELECT_TYPE)idx_in_w < (SELECT_TYPE)0;
+ SELECT_TYPE cond_w_e = (SELECT_TYPE)idx_in_w >= (SELECT_TYPE)(SRC_WIDTH - 1);
+ SELECT_TYPE cond_h_s = (SELECT_TYPE)idx_in_h < (SELECT_TYPE)0;
+ SELECT_TYPE cond_h_e = (SELECT_TYPE)idx_in_h >= (SELECT_TYPE)(SRC_HEIGHT - 1);
+
+ // Invalidate the loaded values if the x or y coordinate was clamped (out-of-bounds)
+ data0 = select(data0, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_s));
+ data1 = select(data1, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_s));
+ data2 = select(data2, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_e));
+ data3 = select(data3,
(VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_e)); + } +#endif // !defined(POOL_MAX) + +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data0 *= data0; + data1 *= data1; + data2 *= data2; + data3 *= data3; +#endif /* defined(POOL_L2) */ + + VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) + res0 = data0; + res0 = POOL_OP(res0, data1); + res0 = POOL_OP(res0, data2); + res0 = POOL_OP(res0, data3); + +#if defined(POOL_AVG) || defined(POOL_L2) +#if defined(EXCLUDE_PADDING) + res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size; +#else // !defined(EXCLUDE_PADDING) + res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))4; +#endif // defined(EXCLUDE_PADDING) +#endif // defined(POOL_AVG) || defined(POOL_L2) + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res0 = SQRT_OP(res0); +#endif // defined(POOL_L2) + + // Store result +#if defined(FP_MIXED_PRECISION) + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); + STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); +#else // defined(FP_MIXED_PRECISION) + STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); +#endif // defined(FP_MIXED_PRECISION) + +#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX) + + // This part is used to return the index of the maximum value + // Note: DST_CHANNELS and DST_BATCH_SIZE can be used for either the input and output tensor + + // note: Batch dimension does not contribute in the offset contribution + VEC_DATA_TYPE(uint, VEC_SIZE) + base_index = (uint)idx_out_c; + + base_index += VEC_OFFS(uint, VEC_SIZE); + + VEC_DATA_TYPE(uint, VEC_SIZE) + index0 = base_index + (uint)x0 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH); + VEC_DATA_TYPE(uint, VEC_SIZE) + index1 = base_index + (uint)x1 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH); + VEC_DATA_TYPE(uint, VEC_SIZE) + index2 = base_index + (uint)x0 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH); + VEC_DATA_TYPE(uint, VEC_SIZE) + index3 = base_index + (uint)x1 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH); + + index0 = select(index1, index0, CONVERT(isgreaterequal(data0, data1), VEC_DATA_TYPE(int, VEC_SIZE))); + index1 = select(index3, index2, CONVERT(isgreaterequal(data2, data3), VEC_DATA_TYPE(int, VEC_SIZE))); + index0 = select(index1, index0, CONVERT(isgreaterequal(max(data0, data1), max(data2, data3)), VEC_DATA_TYPE(int, VEC_SIZE))); + + __global unsigned char *idx_base_ptr = indices_ptr + indices_offset_first_element_in_bytes + idx_out_c * sizeof(uint) + idx_out_w * indices_stride_y + idx_out_h * indices_stride_z + idx_out_n * + indices_stride_w; + + // Store result + STORE_VECTOR_SELECT(index, uint, idx_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0)); +#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX) +} +#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl new file mode 100644 index 0000000000..46268a4a88 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl @@ -0,0 +1,164 @@ 
+/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(INITIAL_VALUE) +#define VEC_TYPE(VEC_SIZE) VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + +#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) +#define VEC_FLOAT(VEC_SIZE) VEC_DATA_TYPE(float, VEC_SIZE) +#define VEC_INT(VEC_SIZE) VEC_DATA_TYPE(int, VEC_SIZE) +#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) +#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) +#define REQUANTIZE(VEC_SIZE, input, in_offset, out_offset, in_scale, out_scale, res) \ + { \ + const VEC_FLOAT(VEC_SIZE) in_f32 = (CONVERT(input, VEC_FLOAT(VEC_SIZE)) - (VEC_FLOAT(VEC_SIZE))((float)in_offset)) * (VEC_FLOAT(VEC_SIZE))((float)in_scale); \ + const VEC_FLOAT(VEC_SIZE) out_f32 = in_f32 / ((VEC_FLOAT(VEC_SIZE))(float)out_scale) + ((VEC_FLOAT(VEC_SIZE))((float)out_offset)); \ + res = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT(VEC_SIZE)), VEC_TYPE(VEC_SIZE)); \ + } +#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ + +#if defined(POOL_AVG) +#define POOL_OP(x, y) ((x) + (y)) +#else /* defined(POOL_AVG) */ +#define POOL_OP(x, y) (max((x), (y))) +#endif /* defined(POOL_AVG) */ + +#define DIV_OP(x, y) (x * (1.f / y)) + +#if defined(POOL_L2) +#error "L2 pooling is not supported" +#endif /* defined(POOL_L2) */ + +#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE) +/** Performs pooling layer of size equal to MxN. This OpenCL kernel can perform the following pooling types: + * -# max, -DPOOL_MAX must be passed at compile time + * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be expluded, -DEXCLUDE_PADDING should be passed at compile time + * + * @note Datatype must be passed at compile type using -DDATA_TYPE e.g. -DDATA_TYPE=uchar. Supported data types are QASYMM8/QASYMM8_SIGNED + * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=int + * @note Pool size must be passed at compile time using -DPOOL_SIZE_X and -DPOOL_SIZE_Y. e.g. 
-DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4
+ * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
+ * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
+ * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ * @note If the output has to be requantized, -DOFFSET_IN1, -DOFFSET_OUT, -DSCALE_IN1 and -DSCALE_OUT must be passed at compile time
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pooling_layer_MxN_quantized_nhwc(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+{
+ // Note: If C is not a multiple of VEC_SIZE, we shift back by VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0
+ // Note: If C is less than VEC_SIZE, VEC_SIZE should be shrunk to the closest smaller VEC_SIZE.
This operation is performed on the host side + int offset_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE); + int idx_out_w = get_global_id(1); +#if DST_BATCH_SIZE != 1 + // If batch size != 1, the batch size dimension is collapsed over the height dimension + int idx_out_h = get_global_id(2) % DST_HEIGHT; + int idx_out_n = get_global_id(2) / DST_HEIGHT; +#else //DST_BATCH_SIZE != 1 + int idx_out_h = get_global_id(2); + int idx_out_n = 0; +#endif // DST_BATCH_SIZE != 1 + + int idx_in_w = idx_out_w * STRIDE_X - PAD_X; + int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y; + + __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + offset_c + idx_out_n * input_stride_w; + + __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + offset_c + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n * output_stride_w; + + int pool_x_s = max((int)0, -idx_in_w); + int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w); + int pool_y_s = max((int)0, -idx_in_h); + int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h); + +#if defined(POOL_AVG) && defined(EXCLUDE_PADDING) + int filter_size = 0; +#elif defined(POOL_AVG) && !defined(EXCLUDE_PADDING) // defined(POOL_AVG) && defined(EXCLUDE_PADDING) + int filter_size = POOL_SIZE_X * POOL_SIZE_Y; +#endif // defined(POOL_AVG) && !defined(EXCLUDE_PADDING) + + VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) + res0 = INITIAL_VALUE; + + for(int y = pool_y_s; y < pool_y_e; ++y) + { + for(int x = pool_x_s; x < pool_x_e; ++x) + { + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + data; + VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) + data0; + + data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)); + data0 = CONVERT(data, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); + + res0 = POOL_OP(res0, data0); + +#if defined(POOL_AVG) && defined(EXCLUDE_PADDING) + filter_size++; +#endif // defined(POOL_AVG) && defined(EXCLUDE_PADDING) + } + } + +#if defined(POOL_AVG) + res0 = (res0 + (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))(filter_size >> 1)) / filter_size; +#endif // defined(POOL_AVG) + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + out_q0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); +#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) + REQUANTIZE(VEC_SIZE, out_q0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT, out_q0); +#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ + + // Store result + STORE_VECTOR_SELECT(out_q, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0)); +} +#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE) +#endif // defined(DATA_TYPE) && defined(INITIAL_VALUE) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/remap.cl b/src/core/CL/cl_kernels/nhwc/remap.cl new file mode 100644 index 0000000000..0b629fe6c9 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/remap.cl @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "warp_helpers.h"
+
+#ifdef DEPTH_OUT
+/** Performs a remapping of an input image to an output given two remapping images using nearest neighbor as interpolation.
+ * Also applies constant border value, "border_val", if "CONSTANT_BORDER" is set.
+ *
+ * This kernel performs remapping with this method of pixel coordinate translation:
+ * out(x,y) = in(mapx(x,y), mapy(x,y));
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8,F16.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8,F16.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32.
+ * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes)
+ * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes)
+ * @param[in] mapx_step_y mapx_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in] mapy_ptr Pointer to the y remapping image. Supported data types: F32.
+ * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes)
+ * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes)
+ * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in] width Width of the input image
+ * @param[in] height Height of the input image
+ * @param[in] border_val Value to use for border around input tensor when the CONSTANT border mode is selected
+ */
+__kernel void remap_nearest_neighbour_nhwc(
+ TENSOR4D_DECLARATION(in),
+ TENSOR4D_DECLARATION(out),
+ TENSOR4D_DECLARATION(mapx),
+ TENSOR4D_DECLARATION(mapy),
+ const float width,
+ const float height
+#ifdef CONSTANT_BORDER
+ ,
+ const DATA_TYPE border_val
+#endif // CONSTANT_BORDER
+)
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
+ Tensor4D mapx = CONVERT_TO_TENSOR4D_STRUCT(mapx, DEPTH_OUT);
+ Tensor4D mapy = CONVERT_TO_TENSOR4D_STRUCT(mapy, DEPTH_OUT);
+
+ float mapx_coord = (float) * (__global float *)mapx.ptr;
+ float mapy_coord = (float) * (__global float *)mapy.ptr;
+
+#ifdef CONSTANT_BORDER
+ if(mapx_coord < 0 || mapx_coord > width - 1 || mapy_coord < 0 || mapy_coord > height - 1)
+ {
+ *((__global DATA_TYPE *)out.ptr) = border_val;
+ return;
+ }
+#else // CONSTANT_BORDER
+ mapx_coord = clamp(mapx_coord, 0.0f, width - 1);
+ mapy_coord = clamp(mapy_coord, 0.0f, height - 1);
+#endif // CONSTANT_BORDER
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(mapx_coord), convert_int(mapy_coord), (get_global_id(2) / DEPTH_OUT)));
+}
+
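+// Worked example of the coordinate translation above (illustrative values): for
+// an output pixel whose map entries are mapx = 3.7f and mapy = 1.2f, the values
+// are first clamped to the valid input range (or replaced by border_val when
+// CONSTANT_BORDER is set and out of range); convert_int then truncates them, so
+// the kernel reads in(3, 1) for that pixel.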
+/** Performs a remapping of an input image to an output given two remapping images using bilinear as interpolation.
+ * Also applies constant border value, "border_val", if "CONSTANT_BORDER" is set.
+ *
+ * This kernel performs remapping with this method of pixel coordinate translation:
+ * out(x,y) = in(mapx(x,y), mapy(x,y));
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8,F16.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8,F16.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32.
+ * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes) + * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes) + * @param[in] mapx_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image + * @param[in] mapy_ptr Pointer to the x remapping image. Supported data types: F32. + * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes) + * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes) + * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image + * @param[in] width Width of the input image + * @param[in] height Height of the input image + * @param[in] border_val Value to use for border around input tensor when in CONSTANT border is selected + */ +__kernel void remap_bilinear_nhwc( + TENSOR4D_DECLARATION(in), + TENSOR4D_DECLARATION(out), + TENSOR4D_DECLARATION(mapx), + TENSOR4D_DECLARATION(mapy), + const float width, + const float height +#ifdef CONSTANT_BORDER + , + const DATA_TYPE border_val +#endif // CONSTANT_BORDER +) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT); + Tensor4D mapx = CONVERT_TO_TENSOR4D_STRUCT(mapx, DEPTH_OUT); + Tensor4D mapy = CONVERT_TO_TENSOR4D_STRUCT(mapy, DEPTH_OUT); + + float mapx_coord = (float) * (__global float *)mapx.ptr; + float mapy_coord = (float) * (__global float *)mapy.ptr; + +#ifdef CONSTANT_BORDER + if(mapx_coord < 0 || mapx_coord > width - 1 || mapy_coord < 0 || mapy_coord > height - 1) + { + *((__global DATA_TYPE *)out.ptr) = border_val; + return; + } +#endif // CONSTANT_BORDER + + const float new_xf = floor(mapx_coord); + const float new_yf = floor(mapy_coord); + const float clamped_x = clamp(new_xf, 0.0f, width - 1); + const float clamped_x1 = clamp(new_xf + 1, 0.0f, width - 1); + const float clamped_y = clamp(new_yf, 0.0f, height - 1); + const float clamped_y1 = clamp(new_yf + 1, 0.0f, height - 1); + + float4 ins = (float4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT)))); + + const float a = mapx_coord - new_xf; + const float b = 1.f - a; + const float a1 = mapy_coord - new_yf; + const float b1 = 1.f - a1; + const float fr = ((ins.s0 * b * b1) + (ins.s1 * a * b1) + (ins.s2 * b * a1) + (ins.s3 * a * a1)); + + *((__global DATA_TYPE *)out.ptr) = CONVERT(fr, DATA_TYPE); +} + +#endif // DEPTH_OUT \ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/reorg_layer.cl b/src/core/CL/cl_kernels/nhwc/reorg_layer.cl new file mode 100644 index 
0000000000..a340b0b8a2 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/reorg_layer.cl @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE) + +#define CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi) \ + ({ \ + int offset = zo / (int)SRC_DEPTH; \ + xi = xo * (int)STRIDE + offset % (int)STRIDE; \ + yi = yo * (int)STRIDE + offset / (int)STRIDE; \ + zi = zo % SRC_DEPTH; \ + }) + +/** Performs a reorganization layer of input tensor to the output tensor when the data layout is NHWC + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64 + * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: All + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reorg_layer_nhwc(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ int xo = get_global_id(1);
+ int yo = get_global_id(2);
+ int zo = get_global_id(0);
+ int xi, yi, zi;
+
+ CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi);
+
+ int src_offset = zi * sizeof(DATA_TYPE) + xi * src_stride_y + yi * src_stride_z;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset));
+}
+#endif // defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/scale.cl b/src/core/CL/cl_kernels/nhwc/scale.cl
new file mode 100644
index 0000000000..1ea5e73df1
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/scale.cl
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "warp_helpers.h"
+
+#if defined(DEPTH_OUT)
+/** Performs scale on an image interpolating with the NEAREST NEIGHBOUR method. (NHWC)
+ *
+ * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8/S16/F16/F32.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_stride_z Stride of the source image in Z dimension (in bytes)
+ * @param[in] in_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in_ptr
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes)
+ * @param[in] out_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] input_width Input image width
+ * @param[in] input_height Input image height
+ * @param[in] scale_x The scale factor along x dimension
+ * @param[in] scale_y The scale factor along y dimension
+ */
+__kernel void scale_nearest_neighbour_nhwc(
+ TENSOR4D_DECLARATION(in),
+ TENSOR4D_DECLARATION(out),
+ const float input_width,
+ const float input_height,
+ const float scale_x,
+ const float scale_y)
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
+
+#ifdef SAMPLING_POLICY_TOP_LEFT
+ float new_x = get_global_id(1) * scale_x;
+ float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
+#elif SAMPLING_POLICY_CENTER
+ float new_x = (get_global_id(1) + 0.5f) * scale_x;
+ float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y;
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
+#ifdef ALIGN_CORNERS
+ new_x = round(new_x);
+ new_y = round(new_y);
+#endif /* ALIGN_CORNERS */
+ const float clamped_x = clamp(new_x, 0.0f, input_width - 1);
+ const float clamped_y = clamp(new_y, 0.0f, input_height - 1);
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT)));
+}
+
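+// Illustrative example of the sampling math above (values are assumptions): with
+// SAMPLING_POLICY_CENTER, scale_x = 2.0f and get_global_id(1) == 3, the kernel
+// computes new_x = (3 + 0.5f) * 2.0f = 7.0f, clamps it to [0, input_width - 1]
+// and reads input column 7 as the nearest neighbour.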
+/** Performs scale on an image interpolating with the BILINEAR method. (NHWC)
+ *
+ * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ * @note If border mode replicate is used, it should be passed as -DBORDER_MODE_REPLICATE
+ * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16
+ * @note The value to be used at the edges of the images should be given as a preprocessor argument using -DCONSTANT_VALUE=value.
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8/S16/F16/F32.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in_stride_z Stride of the source image in Z dimension (in bytes) + * @param[in] in_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in_ptr + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes) + * @param[in] out_step_z dst_stride_y * number of elements along Z processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] input_width Input image width + * @param[in] input_height Input image height + * @param[in] scale_x The scale factor along x dimension + * @param[in] scale_y The scale factor along y dimension + * + */ +__kernel void scale_bilinear_nhwc( + TENSOR4D_DECLARATION(in), + TENSOR4D_DECLARATION(out), + const float input_width, + const float input_height, + const float scale_x, + const float scale_y) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT); + +#ifdef SAMPLING_POLICY_TOP_LEFT + const float new_x = get_global_id(1) * scale_x; + const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y; +#elif SAMPLING_POLICY_CENTER + const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f; + const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y - 0.5f; +#else /* SAMPLING_POLICY */ +#error("Unsupported sampling policy"); +#endif /* SAMPLING_POLICY */ + + const float new_xf = floor(new_x); + const float new_yf = floor(new_y); + const float clamped_x = clamp(new_xf, 0.0f, input_width - 1); + const float clamped_x1 = clamp(new_xf + 1, 0.0f, input_width - 1); + const float clamped_y = clamp(new_yf, 0.0f, input_height - 1); + const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1); + +#ifndef BORDER_MODE_REPLICATE + const bool check_x = (0.f <= new_xf && new_xf < input_width); + const bool check_x1 = (-1.f <= new_xf && new_xf < input_width - 1); + const bool check_y = (0.f <= new_yf && new_yf < input_height); + const bool check_y1 = (-1.f <= new_yf && new_yf < input_height - 1); + const float ins_0 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), + (get_global_id(2) / DEPTH_OUT)))), + check_x && check_y); + const float ins_1 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), + (get_global_id(2) / DEPTH_OUT)))), + check_x1 && check_y); + const float ins_2 = select((float)(CONSTANT_VALUE), (float)(*((__global 
DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), + (get_global_id(2) / DEPTH_OUT)))), + check_x && check_y1); + const float ins_3 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), + (get_global_id(2) / DEPTH_OUT)))), + check_x1 && check_y1); + float4 ins = (float4)(ins_0, ins_1, ins_2, ins_3); +#else /* BORDER_MODE_REPLICATE */ + float4 ins = (float4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT)))); +#endif /* BORDER_MODE_REPLICATE */ + + const float a = new_x - new_xf; + const float b = 1.f - a; + const float a1 = new_y - new_yf; + const float b1 = 1.f - a1; + const float fr = ((ins.s0 * b * b1) + (ins.s1 * a * b1) + (ins.s2 * b * a1) + (ins.s3 * a * a1)); + + *((__global DATA_TYPE *)out.ptr) = CONVERT(fr, DATA_TYPE); +} +#endif /* defined(DEPTH_OUT) */ \ No newline at end of file
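scale_bilinear_nhwc above blends the four neighbouring samples with weights taken from the fractional distance of the transformed coordinate to its top-left neighbour. A self-contained C sketch of that blend (illustrative only); ins holds the samples in the kernel's order {top-left, top-right, bottom-left, bottom-right}:

    #include <math.h>

    static float bilinear_blend(const float ins[4], float new_x, float new_y)
    {
        const float a  = new_x - floorf(new_x); /* horizontal fraction       */
        const float b  = 1.0f - a;              /* weight of the left column */
        const float a1 = new_y - floorf(new_y); /* vertical fraction         */
        const float b1 = 1.0f - a1;             /* weight of the top row     */

        return (ins[0] * b * b1) + (ins[1] * a * b1) + (ins[2] * b * a1) + (ins[3] * a * a1);
    }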
diff --git a/src/core/CL/cl_kernels/nhwc/scale_quantized.cl b/src/core/CL/cl_kernels/nhwc/scale_quantized.cl new file mode 100644 index 0000000000..de9bb607b0 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/scale_quantized.cl @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers_asymm.h" +#include "warp_helpers_quantized.h" + +#if defined(DEPTH_OUT) +/** Performs scale on an image interpolating with the BILINEAR method. (NHWC) + * + * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT + * @note Scale value for QASYMM8 data type to be used is passed as -DSCALE= e.g. -DSCALE=0.5 + * @note Offset value for QASYMM8 data type to be used is passed as -DOFFSET= e.g. -DOFFSET=1 + * @note If border mode replicate is used, it should be passed as -DBORDER_MODE_REPLICATE + * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 + * @note The value to be used at the edges of the images should be given as a preprocessor argument using -DCONSTANT_VALUE=value. + * + * @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8. + * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in_stride_z Stride of the source image in Z dimension (in bytes) + * @param[in] in_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in_ptr + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes) + * @param[in] out_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] input_width Input image width + * @param[in] input_height Input image height + * @param[in] scale_x The scale factor along x dimension + * @param[in] scale_y The scale factor along y dimension + */ +__kernel void scale_bilinear_quantized_nhwc( + TENSOR4D_DECLARATION(in), + TENSOR4D_DECLARATION(out), + const float input_width, + const float input_height, + const float scale_x, + const float scale_y) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT); + +#ifdef SAMPLING_POLICY_TOP_LEFT + const float new_x = get_global_id(1) * scale_x; + const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y; +#elif SAMPLING_POLICY_CENTER + const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f; + const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y - 0.5f; +#else /* SAMPLING_POLICY */ +#error("Unsupported sampling policy"); +#endif /* SAMPLING_POLICY */ + + const float new_xf = floor(new_x); + const float new_yf = floor(new_y); + const float clamped_x = clamp(new_xf, 0.0f, input_width - 1); + const float clamped_x1 = clamp(new_xf + 1, 0.0f, input_width - 1); + const float clamped_y = clamp(new_yf, 0.0f, input_height - 1); + const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1); + +#ifndef BORDER_MODE_REPLICATE + const bool check_x = (0.f <= new_xf && new_xf < input_width); + const bool check_x1 = (-1.f <= new_xf && new_xf < input_width - 1); + const bool check_y = (0.f <= new_yf && new_yf < input_height); + const bool check_y1 = (-1.f <= new_yf && new_yf < input_height - 1); + const int ins_0 =
select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), + (get_global_id(2) / DEPTH_OUT)))), + check_x && check_y); + const int ins_1 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), + (get_global_id(2) / DEPTH_OUT)))), + check_x1 && check_y); + const int ins_2 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), + (get_global_id(2) / DEPTH_OUT)))), + check_x && check_y1); + const int ins_3 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), + (get_global_id(2) / DEPTH_OUT)))), + check_x1 && check_y1); + int4 ins = (int4)(ins_0, ins_1, ins_2, ins_3); +#else /* BORDER_MODE_REPLICATE */ + int4 ins = (int4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))), + *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT)))); +#endif /* BORDER_MODE_REPLICATE */ + + const float a = new_x - new_xf; + const float b = 1.f - a; + const float a1 = new_y - new_yf; + const float b1 = 1.f - a1; + const float4 insf32 = convert_float4(ins - (int4)OFFSET) * (float4)SCALE; + + const float fr = ((insf32.s0 * b * b1) + (insf32.s1 * a * b1) + (insf32.s2 * b * a1) + (insf32.s3 * a * a1)); + + DATA_TYPE res = CONVERT_SAT(convert_int_sat_rtp(fr / SCALE) + OFFSET, DATA_TYPE); + + *((__global DATA_TYPE *)out.ptr) = res; +} +#endif /* defined(DEPTH_OUT) */ \ No newline at end of file
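The quantized kernel above interpolates in the dequantized domain: each QASYMM8 neighbour is mapped to float as (q - OFFSET) * SCALE, blended with the same bilinear weights, and the result is requantized with round-to-plus-infinity and saturation. A C sketch of the two conversions implied by the kernel, with made-up stand-ins for the -DSCALE/-DOFFSET build options:

    #include <math.h>
    #include <stdint.h>

    #define SCALE  0.5f /* example value for -DSCALE  */
    #define OFFSET 1    /* example value for -DOFFSET */

    static float dequantize_qasymm8(uint8_t q)
    {
        return ((int)q - OFFSET) * SCALE;
    }

    static uint8_t requantize_qasymm8(float fr)
    {
        int q = (int)ceilf(fr / SCALE) + OFFSET; /* convert_int_sat_rtp rounds towards +inf */
        q     = q < 0 ? 0 : (q > 255 ? 255 : q); /* CONVERT_SAT saturates to the U8 range   */
        return (uint8_t)q;
    }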
diff --git a/src/core/CL/cl_kernels/nhwc/space_to_batch.cl b/src/core/CL/cl_kernels/nhwc/space_to_batch.cl new file mode 100644 index 0000000000..785206e3b9 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/space_to_batch.cl @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN) +/** Calculate the space to batch conversion. (NHWC) + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float + * @note The block shape tensor rank must be passed at compile time using -DBLOCK_SHAPE_DIM. e.g. -DBLOCK_SHAPE_DIM=2 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: All + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image + * @param[in] paddings_ptr Pointer to the second source image. Supported data types: S32 + * @param[in] paddings_stride_x Stride of the paddings tensor in X dimension (in bytes) + * @param[in] paddings_step_x paddings_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] paddings_stride_y Stride of the paddings tensor in Y dimension (in bytes) + * @param[in] paddings_step_y paddings_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] paddings_offset_first_element_in_bytes The offset of the first element in the second source image + * @param[in] block_shape_ptr Pointer to the block shape tensor. Supported data types: S32 + * @param[in] block_shape_stride_x Stride of the block shape tensor in X dimension (in bytes) + * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor + * @param[in] batch_id The output tensor batch id + * @param[out] output_ptr Pointer to the destination tensor.
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void space_to_batch_nhwc( + TENSOR4D_DECLARATION(input), + IMAGE_DECLARATION(paddings), + VECTOR_DECLARATION(block_shape), + const int batch_id, + TENSOR3D_DECLARATION(output)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); + Image pad = CONVERT_TO_IMAGE_STRUCT_NO_STEP(paddings); + Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); + + const int pad_left_x = *((__global int *)offset(&pad, 0, 0)); + const int pad_right_x = *((__global int *)offset(&pad, 1, 0)); + const int pad_left_y = *((__global int *)offset(&pad, 0, 1)); + const int pad_right_y = *((__global int *)offset(&pad, 1, 1)); + + int block_x = *((__global int *)vector_offset(&block, 0)); + int block_y = *((__global int *)vector_offset(&block, 1)); + + const int out_x = get_global_id(1); + const int out_y = get_global_id(2); + const int z = get_global_id(0); + + const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x); + const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x); + + if(((pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) && (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN))) + { + const int w = batch_id % BATCH_IN; + const int in_x = pos_x - pad_left_x; + const int in_y = pos_y - pad_left_y; + + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w)); + } +} +#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
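Each output element of space_to_batch_nhwc maps back to a position in the padded input: the output batch index encodes both the original batch and the offset inside the block. A plain C restatement of the index arithmetic used above (names are illustrative; batch_in corresponds to the BATCH_IN define):

    /* Computes the padded-input position read by output element (out_x, out_y)
     * of output batch batch_id. */
    static void space_to_batch_src_coords(int out_x, int out_y, int batch_id,
                                          int block_x, int block_y, int batch_in,
                                          int *pos_x, int *pos_y, int *src_batch)
    {
        const int block_id = batch_id / batch_in; /* which block offset this output batch holds */

        *pos_x     = out_x * block_x + (block_id % block_x);
        *pos_y     = out_y * block_y + (block_id / block_x);
        *src_batch = batch_id % batch_in;         /* original batch the element came from */
        /* The element is read from (pos_x - pad_left_x, pos_y - pad_left_y) only when
         * the position falls inside the un-padded input window. */
    }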
+ +#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN) +/** Calculate the space to batch conversion. (NHWC) + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float + * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2 + * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2 + * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2 + * @note The starting pad value of x must be passed at compile time using -DPAD_LEFT_X. e.g. -DPAD_LEFT_X=2 + * @note The ending pad value of x must be passed at compile time using -DPAD_RIGHT_X. e.g. -DPAD_RIGHT_X=2 + * @note The starting pad value of y must be passed at compile time using -DPAD_LEFT_Y. e.g. -DPAD_LEFT_Y=2 + * @note The ending pad value of y must be passed at compile time using -DPAD_RIGHT_Y. e.g. -DPAD_RIGHT_Y=2 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: All + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image + * @param[in] batch_id The output tensor batch id + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void space_to_batch_static_nhwc( + TENSOR4D_DECLARATION(input), + const int batch_id, + TENSOR3D_DECLARATION(output)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); + + int block_x = BLOCK_SHAPE_X; + int block_y = BLOCK_SHAPE_Y; + + const int out_x = get_global_id(1); + const int out_y = get_global_id(2); + const int z = get_global_id(0); + + const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x); + const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x); + + if(pos_y >= PAD_LEFT_Y && pos_y < PAD_LEFT_Y + HEIGHT_IN && pos_x >= PAD_LEFT_X && pos_x < PAD_LEFT_X + WIDTH_IN) + { + const int w = batch_id % BATCH_IN; + const int in_x = pos_x - PAD_LEFT_X; + const int in_y = pos_y - PAD_LEFT_Y; + + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w)); + } +} +#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN) \ No newline at end of file
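The static variant bakes every parameter into the program as -D defines instead of reading the paddings and block-shape tensors at run time. A host-side C sketch of assembling such a build-options string (all values are made-up examples; in the library the options are produced by the kernel-configuration code):

    #include <stdio.h>

    static void space_to_batch_static_build_opts(char *buf, size_t len)
    {
        snprintf(buf, len,
                 "-DDATA_TYPE=float -DBATCH_SIZE=4 -DBATCH_IN=1 "
                 "-DBLOCK_SHAPE_X=2 -DBLOCK_SHAPE_Y=2 "
                 "-DPAD_LEFT_X=1 -DPAD_RIGHT_X=1 -DPAD_LEFT_Y=1 -DPAD_RIGHT_Y=1 "
                 "-DWIDTH_IN=8 -DHEIGHT_IN=8");
        /* The resulting string is passed as the options argument of clBuildProgram(). */
    }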
diff --git a/src/core/CL/cl_kernels/nhwc/space_to_depth.cl b/src/core/CL/cl_kernels/nhwc/space_to_depth.cl new file mode 100644 index 0000000000..d44e78d990 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/space_to_depth.cl @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE) +/** Space to depth transformation. (NHWC) + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float + * @note The output tensor channel size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2 + * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: All + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[in] batch_id The input tensor batch id + * @param[out] output_ptr Pointer to the destination tensor.
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void space_to_depth_nhwc( + TENSOR4D_DECLARATION(input), + const int batch_id, + TENSOR3D_DECLARATION(output)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); + + const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE)); + const int x = get_global_id(1); + const int y = get_global_id(2); + const int z = get_global_id(0) % r; + + const int in_x = x * BLOCK_SHAPE + (get_global_id(0) / r) % BLOCK_SHAPE; + const int in_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE; + + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, batch_id)); +} +#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE) \ No newline at end of file
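space_to_depth_nhwc above splits the output channel index: the low part selects the source channel and the high part selects which of the BLOCK_SHAPE x BLOCK_SHAPE spatial offsets the channel group encodes. The same mapping restated in plain C (names illustrative; channels corresponds to -DCHANNEL_SIZE, block to -DBLOCK_SHAPE):

    /* Maps output element (gid0, x, y) to its source coordinates. */
    static void space_to_depth_src_coords(int gid0, int x, int y,
                                          int block, int channels,
                                          int *in_x, int *in_y, int *in_z)
    {
        const int r     = channels / (block * block); /* channel count of the source tensor */
        const int shift = gid0 / r;                   /* spatial offset inside the block    */

        *in_z = gid0 % r;
        *in_x = x * block + shift % block;
        *in_y = y * block + shift / block;
    }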
Supported data types: All + * -# -DVEC_SIZE_IN = Input vector size + * -# -DVEC_SIZE_OUT = Output vector size + * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to set this, might need to step back a bit) + * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to set this, might need to step back a bit) + * + * @param[in] src_ptr Pointer to the source image. Supported data types: All + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void upsample_layer_nhwc( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + +#if defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT) + // Check if access on width gets out of bounds + // If it does shift access vector to access elements within bounds + const int xi_in = (int)(get_global_id(0) * VEC_SIZE_IN); + const int xi_out = (int)(get_global_id(0) * VEC_SIZE_OUT); + src.ptr -= max(xi_in - (int)LAST_ACCESSED_X_IN, 0) * src_stride_x; + dst.ptr -= max(xi_out - (int)LAST_ACCESSED_X_OUT, 0) * dst_stride_x; + + VEC_DATA_TYPE(DATA_TYPE, 16) + data = vload16(0, (__global DATA_TYPE *)src.ptr); + + vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0)); + vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0)); + vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 1)); + vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 1)); +#else // !defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT) + *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0)) = *((__global DATA_TYPE *)src.ptr); + *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0)) = *((__global DATA_TYPE *)src.ptr); + *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 1)) = *((__global DATA_TYPE *)src.ptr); + *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 1)) = *((__global DATA_TYPE *)src.ptr); +#endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT) +} \ No newline at end of file
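The vectorized path above uses the usual shift-back trick for boundary handling: when a work-item's 16-wide access would run past LAST_ACCESSED_X, the pointer is stepped back so the access stays in bounds, and the overlapping elements are simply rewritten with the same values. The clamp in isolation, as a C sketch (names illustrative):

    /* First X element actually accessed by work-item gid after the
     * out-of-bounds shift: effectively min(gid * vec_size, last_accessed_x). */
    static int clamped_access_start(int gid, int vec_size, int last_accessed_x)
    {
        const int xi        = gid * vec_size;
        const int overshoot = xi - last_accessed_x;

        return xi - (overshoot > 0 ? overshoot : 0);
    }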
diff --git a/src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl new file mode 100644 index 0000000000..8d5fd3437f --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl @@ -0,0 +1,1075 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(SRC_DIM_Z) + +#define OUTPUT_ROW_2x2_7x7(out, tmp) \ + ({ \ + out.s0 = -tmp.s0 / 36.f; \ + out.s1 = (tmp.s0 - tmp.s1 + tmp.s2 - tmp.s3 + tmp.s4 - tmp.s5 + tmp.s6) / 48.f; \ + out.s2 = (tmp.s0 + tmp.s1 + tmp.s2 + tmp.s3 + tmp.s4 + tmp.s5 + tmp.s6) / 48.f; \ + out.s3 = (-tmp.s0 + 2.f * tmp.s1 - 4.f * tmp.s2 + 8.f * tmp.s3 - 16.f * tmp.s4 + 32.f * tmp.s5 - 64.f * tmp.s6) / 120.f; \ + out.s4 = (-tmp.s0 - 2.f * tmp.s1 - 4.f * tmp.s2 - 8.f * tmp.s3 - 16.f * tmp.s4 - 32.f * tmp.s5 - 64.f * tmp.s6) / 120.f; \ + out.s5 = (tmp.s0 - 3.f * tmp.s1 + 9.f * tmp.s2 - 27.f * tmp.s3 + 81.f * tmp.s4 - 243.f * tmp.s5 + 729.f * tmp.s6) / 720.f; \ + out.s6 = (tmp.s0 + 3.f * tmp.s1 + 9.f * tmp.s2 + 27.f * tmp.s3 + 81.f * tmp.s4 + 243.f * tmp.s5 + 729.f * tmp.s6) / 720.f; \ + out.s7 = tmp.s6; \ + }) + +/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NHWC and the output tile is 4x4/4x1/1x4 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor.
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_4x4_3x3_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z); + + const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w; + + // Load the values from the input tensor +#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z)); + DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z)); + DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z)); +#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 0 * src_stride_y)); + DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 1 * src_stride_y)); + DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 2 * src_stride_y)); +#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) + DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y)); + DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y)); + DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y)); + DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y)); + DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y)); + DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y)); +#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) +#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + + // Row 0 + DATA_TYPE out00, out01, out02, out03, out04, out05; + out00 = (w00) / 16.f; + out01 = (-w00 - w01 - w02) / 24.f; + out02 = (-w00 + w01 - w02) / 24.f; + out03 = (w00 + 2.f * w01
+ 4.f * w02) / 96.f; + out04 = (w00 - 2.f * w01 + 4.f * w02) / 96.f; + out05 = (w02) / 4.f; + +#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + // Row 1 + DATA_TYPE out10, out11, out12, out13, out14, out15; + out10 = (-w00 - w10 - w20) / 24.f; + out11 = (w00 + w10 + w20 + w01 + w11 + w21 + w02 + w12 + w22) / 36.f; + out12 = (w00 + w10 + w20 - w01 - w11 - w21 + w02 + w12 + w22) / 36.f; + out13 = (-w00 - w10 - w20 + 2.f * (-w01 - w11 - w21) + 4.f * (-w02 - w12 - w22)) / 144.f; + out14 = (-w00 - w10 - w20 + 2.f * (w01 + w11 + w21) + 4.f * (-w02 - w12 - w22)) / 144.f; + out15 = (-w02 - w12 - w22) / 6.f; + + // Row 2 + DATA_TYPE out20, out21, out22, out23, out24, out25; + out20 = (-w00 + w10 - w20) / 24.f; + out21 = (w00 - w10 + w20 + w01 - w11 + w21 + w02 - w12 + w22) / 36.f; + out22 = (w00 - w10 + w20 - w01 + w11 - w21 + w02 - w12 + w22) / 36.f; + out23 = (-w00 + w10 - w20 + 2.f * (-w01 + w11 - w21) + 4.f * (-w02 + w12 - w22)) / 144.f; + out24 = (-w00 + w10 - w20 + 2.f * (w01 - w11 + w21) + 4.f * (-w02 + w12 - w22)) / 144.f; + out25 = (-w02 + w12 - w22) / 6.f; + + // Row 3 + DATA_TYPE out30, out31, out32, out33, out34, out35; + out30 = (w00 + 2.f * w10 + 4.f * w20) / 96.f; + out31 = (-w00 - 2.f * w10 - 4.f * w20 - w01 - 2.f * w11 - 4.f * w21 - w02 - 2.f * w12 - 4.f * w22) / 144.f; + out32 = (-w00 - 2.f * w10 - 4.f * w20 + w01 + 2.f * w11 + 4.f * w21 - w02 - 2.f * w12 - 4.f * w22) / 144.f; + out33 = ((w00 + 2.f * w10 + 4.f * w20) + 2.f * (w01 + 2.f * w11 + 4.f * w21) + 4.f * (w02 + 2.f * w12 + 4.f * w22)) / 576.f; + out34 = ((w00 + 2.f * w10 + 4.f * w20) + 2.f * (-w01 - 2.f * w11 - 4.f * w21) + 4.f * (w02 + 2.f * w12 + 4.f * w22)) / 576.f; + out35 = (w02 + 2.f * w12 + 4.f * w22) / 24.f; + + // Row 4 + DATA_TYPE out40, out41, out42, out43, out44, out45; + out40 = (w00 - 2.f * w10 + 4.f * w20) / 96.f; + out41 = (-w00 + 2.f * w10 - 4.f * w20 - w01 + 2.f * w11 - 4.f * w21 - w02 + 2.f * w12 - 4.f * w22) / 144.f; + out42 = (-w00 + 2.f * w10 - 4.f * w20 + w01 - 2.f * w11 + 4.f * w21 - w02 + 2.f * w12 - 4.f * w22) / 144.f; + out43 = ((w00 - 2.f * w10 + 4.f * w20) + 2.f * (w01 - 2.f * w11 + 4.f * w21) + 4.f * (w02 - 2.f * w12 + 4.f * w22)) / 576.f; + out44 = ((w00 - 2.f * w10 + 4.f * w20) + 2.f * (-w01 + 2.f * w11 - 4.f * w21) + 4.f * (w02 - 2.f * w12 + 4.f * w22)) / 576.f; + out45 = (w02 - 2.f * w12 + 4.f * w22) / 24.f; + + // Row 5 + DATA_TYPE out50, out51, out52, out53, out54, out55; + out50 = (w20) / 4.f; + out51 = (-w20 - w21 - w22) / 6.f; + out52 = (-w20 + w21 - w22) / 6.f; + out53 = (w20 + 2.f * w21 + 4.f * w22) / 24.f; + out54 = (w20 - 2.f * w21 + 4.f * w22) / 24.f; + out55 = (w22); +#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + + int x0 = get_global_id(2); // idx filter + int y0 = get_global_id(0); // idx channel + + // Get output address + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y; + + // Store the values across the channels + // 36 channels for 3x3 kernels + // 6 channels for 3x1 or 1x3 kernels + *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out00; + *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out01; + *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out02; + *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out03; + *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out04; + *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out05; +#if 
!defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out10; + *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out11; + *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out12; + *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out13; + *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out14; + *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out15; + *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out20; + *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out21; + *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out22; + *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out23; + *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out24; + *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out25; + *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out30; + *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out31; + *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out32; + *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out33; + *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out34; + *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out35; + *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out40; + *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out41; + *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out42; + *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out43; + *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out44; + *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out45; + *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out50; + *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out51; + *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out52; + *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out53; + *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out54; + *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out55; +#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) +}
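Every one of the 36 values stored above is a fixed linear combination of the 9 filter taps: the kernel is an unrolled evaluation of OUT = G * W * G^T for a constant 6x3 transform matrix G, with only the first row produced in the 3x1/1x3 cases. A generic C sketch of that structure; the matrix g is a caller-supplied placeholder, not the exact coefficients hard-coded in the kernel:

    /* out = g * w * g^T, the classic Winograd filter-transform shape. */
    static void filter_transform_3x3(const float w[3][3], const float g[6][3],
                                     float out[6][6])
    {
        float gw[6][3]; /* g * w */

        for(int i = 0; i < 6; ++i)
        {
            for(int j = 0; j < 3; ++j)
            {
                gw[i][j] = g[i][0] * w[0][j] + g[i][1] * w[1][j] + g[i][2] * w[2][j];
            }
        }
        for(int i = 0; i < 6; ++i) /* (g * w) * g^T */
        {
            for(int j = 0; j < 6; ++j)
            {
                out[i][j] = gw[i][0] * g[j][0] + gw[i][1] * g[j][1] + gw[i][2] * g[j][2];
            }
        }
    }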
+ +/** This OpenCL kernel performs Winograd filter transform 5x5/5x1 or 1x5 when the data layout is NHWC and the output tile is 4x4/4x1 or 1x4 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note If this kernel is used to perform Winograd filter transform 5x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd filter transform 1x5, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_4x4_5x5_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z); + + const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * sizeof(DATA_TYPE) + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w; + +#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + // Load the values from the input tensor + DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z)); + DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z)); + DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z)); + DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z)); + DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z)); +#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + // Load the values from the input tensor + DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)); + DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)); + DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)); + DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)); + DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)); +#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + +#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y)); + DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y)); + DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y)); + DATA_TYPE w13 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 3 * src_stride_y)); + DATA_TYPE w14 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 4 * src_stride_y)); +
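// Rows 2 to 4 of the 5x5 filter window are loaded below in the same way, one value per (z, y) offset +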
DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y)); + DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y)); + DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y)); + DATA_TYPE w23 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 3 * src_stride_y)); + DATA_TYPE w24 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 4 * src_stride_y)); + DATA_TYPE w30 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 0 * src_stride_y)); + DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 1 * src_stride_y)); + DATA_TYPE w32 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 2 * src_stride_y)); + DATA_TYPE w33 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 3 * src_stride_y)); + DATA_TYPE w34 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 4 * src_stride_y)); + DATA_TYPE w40 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 0 * src_stride_y)); + DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 1 * src_stride_y)); + DATA_TYPE w42 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 2 * src_stride_y)); + DATA_TYPE w43 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 3 * src_stride_y)); + DATA_TYPE w44 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 4 * src_stride_y)); +#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + + // Row 0 + VEC_DATA_TYPE(DATA_TYPE, 8) + out0 = 0.0f; + out0.s0 = w00; + out0.s1 = -2.f * (w00 + w01 + w02 + w03 + w04) / 9.f; + out0.s2 = -2.f * (w00 - w01 + w02 - w03 + w04) / 9.f; + out0.s3 = (w00 + 2.f * w01 + 4.f * w02 + 8.f * w03 + 16.f * w04) / 90.f; + out0.s4 = (w00 - 2.f * w01 + 4.f * w02 - 8.f * w03 + 16.f * w04) / 90.f; + out0.s5 = (16.f * w00 + 8.f * w01 + 4.f * w02 + 2.f * w03 + w04) / 180.f; + out0.s6 = (16.f * w00 - 8.f * w01 + 4.f * w02 - 2.f * w03 + w04) / 180.f; + out0.s7 = w04; + +#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + // Row 1 + VEC_DATA_TYPE(DATA_TYPE, 8) + out1 = 0.0f; + out1.s0 = -2.f * (w00 + w10 + w20 + w30 + w40) / 9.f; + out1.s1 = 4.f * ((w00 + w10 + w20 + w30 + w40) + (w01 + w11 + w21 + w31 + w41) + (w02 + w12 + w22 + w32 + w42) + (w03 + w13 + w23 + w33 + w43) + (w04 + w14 + w24 + w34 + w44)) / 81.f; + out1.s2 = 4.f * ((w00 + w10 + w20 + w30 + w40) - (w01 + w11 + w21 + w31 + w41) + (w02 + w12 + w22 + w32 + w42) - (w03 + w13 + w23 + w33 + w43) + (w04 + w14 + w24 + w34 + w44)) / 81.f; + out1.s3 = -((w00 + w10 + w20 + w30 + w40) + 2.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) + 8.f * (w03 + w13 + w23 + w33 + w43) + 16.f * + (w04 + w14 + w24 + w34 + w44)) / 405.f; + out1.s4 = -((w00 + w10 + w20 + w30 + w40) - 2.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) - 8.f * (w03 + w13 + w23 + w33 + w43) + 16.f * + (w04 + w14 + w24 + w34 + w44)) / 405.f; + out1.s5 = -(16.f * (w00 + w10 + w20 + w30 + w40) + 8.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) + 2.f * (w03 + w13 + w23 + w33 + w43) + + (w04 + w14 + w24 + w34 + w44)) / 810.f; + out1.s6 = -(16.f * (w00 + w10 + w20 + w30 + w40) - 8.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) - 2.f * (w03 + w13 + w23 + w33 + w43) + + (w04 + w14 + w24 + w34 + w44)) / 810.f; + out1.s7 = -2.f * (w04 + w14 + w24 + w34 + w44) / 9.f; + + // Row 2 + VEC_DATA_TYPE(DATA_TYPE, 8) + out2 = 0.0f; + out2.s0 = 
-2.f * (w00 - w10 + w20 - w30 + w40) / 9.f; + out2.s1 = 4.f * ((w00 - w10 + w20 - w30 + w40) + (w01 - w11 + w21 - w31 + w41) + (w02 - w12 + w22 - w32 + w42) + (w03 - w13 + w23 - w33 + w43) + (w04 - w14 + w24 - w34 + w44)) / 81.f; + out2.s2 = 4.f * ((w00 - w10 + w20 - w30 + w40) - (w01 - w11 + w21 - w31 + w41) + (w02 - w12 + w22 - w32 + w42) - (w03 - w13 + w23 - w33 + w43) + (w04 - w14 + w24 - w34 + w44)) / 81.f; + out2.s3 = -((w00 - w10 + w20 - w30 + w40) + 2.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) + 8.f * (w03 - w13 + w23 - w33 + w43) + 16.f * + (w04 - w14 + w24 - w34 + w44)) / 405.f; + out2.s4 = -((w00 - w10 + w20 - w30 + w40) - 2.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) - 8.f * (w03 - w13 + w23 - w33 + w43) + 16.f * + (w04 - w14 + w24 - w34 + w44)) / 405.f; + out2.s5 = -(16.f * (w00 - w10 + w20 - w30 + w40) + 8.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) + 2.f * (w03 - w13 + w23 - w33 + w43) + + (w04 - w14 + w24 - w34 + w44)) / 810.f; + out2.s6 = -(16.f * (w00 - w10 + w20 - w30 + w40) - 8.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) - 2.f * (w03 - w13 + w23 - w33 + w43) + + (w04 - w14 + w24 - w34 + w44)) / 810.f; + out2.s7 = -2.f * (w04 - w14 + w24 - w34 + w44) / 9.f; + + // Row 3 + VEC_DATA_TYPE(DATA_TYPE, 8) + out3 = 0.0f; + out3.s0 = (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) / 90.f; + out3.s1 = -((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + + (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 405.f; + out3.s2 = -((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - + (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 405.f; + out3.s3 = ((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + 2.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f * (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + 8.f + * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + 16.f * (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 8100.f; + out3.s4 = ((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - 2.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f * (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - 8.f + * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + 16.f * (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 8100.f; + out3.s5 = (16.f * (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + 8.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f * + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + 2.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 16200.f; + out3.s6 = (16.f * (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - 8.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f * + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - 2.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 16200.f; + out3.s7 = (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44) / 90.f; + + // Row 4 + VEC_DATA_TYPE(DATA_TYPE, 8) + out4 = 0.0f; + out4.s0 = (w00 - 2.f * 
w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) / 90.f; + out4.s1 = -((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + + (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 405.f; + out4.s2 = -((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - + (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 405.f; + out4.s3 = ((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + 2.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f * (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + 8.f + * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + 16.f * (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 8100.f; + out4.s4 = ((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - 2.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f * (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - 8.f + * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + 16.f * (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 8100.f; + out4.s5 = (16.f * (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + 8.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f * + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + 2.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 16200.f; + out4.s6 = (16.f * (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - 8.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f * + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - 2.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 16200.f; + out4.s7 = (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44) / 90.f; + + // Row 5 + VEC_DATA_TYPE(DATA_TYPE, 8) + out5 = 0.0f; + out5.s0 = (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) / 180.f; + out5.s1 = -((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + + (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 810.f; + out5.s2 = -((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - + (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 810.f; + out5.s3 = ((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + 2.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f * (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + 8.f + * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + 16.f * (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 16200.f; + out5.s4 = ((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - 2.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f * (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - 8.f + * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + 16.f * (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 16200.f; + out5.s5 = (16.f * (16.f * w00 + 8.f * w10 + 4.f * 
w20 + 2.f * w30 + w40) + 8.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f * + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + 2.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 32400.f; + out5.s6 = (16.f * (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - 8.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f * + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - 2.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 32400.f; + out5.s7 = (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44) / 180.f; + + // Row 6 + VEC_DATA_TYPE(DATA_TYPE, 8) + out6 = 0.0f; + out6.s0 = (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) / 180.f; + out6.s1 = -((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + + (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 810.f; + out6.s2 = -((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - + (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 810.f; + out6.s3 = ((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + 2.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f * (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + 8.f + * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + 16.f * (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 16200.f; + out6.s4 = ((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - 2.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f * (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - 8.f + * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + 16.f * (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 16200.f; + out6.s5 = (16.f * (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + 8.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f * + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + 2.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 32400.f; + out6.s6 = (16.f * (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - 8.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f * + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - 2.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 32400.f; + out6.s7 = (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44) / 180.f; + + // Row 7 + VEC_DATA_TYPE(DATA_TYPE, 8) + out7 = 0.0f; + out7.s0 = w40; + out7.s1 = -2.f * (w40 + w41 + w42 + w43 + w44) / 9.f; + out7.s2 = -2.f * (w40 - w41 + w42 - w43 + w44) / 9.f; + out7.s3 = (w40 + 2.f * w41 + 4.f * w42 + 8.f * w43 + 16.f * w44) / 90.f; + out7.s4 = (w40 - 2.f * w41 + 4.f * w42 - 8.f * w43 + 16.f * w44) / 90.f; + out7.s5 = (16.f * w40 + 8.f * w41 + 4.f * w42 + 2.f * w43 + w44) / 180.f; + out7.s6 = (16.f * w40 - 8.f * w41 + 4.f * w42 - 2.f * w43 + w44) / 180.f; + out7.s7 = w44; +#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + + int x0 = get_global_id(2); // idx filter + int y0 = get_global_id(0); // idx 
channel + + // Get output address + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y; + + // Store the values across the channels + *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0; + *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1; + *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2; + *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3; + *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4; + *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5; + *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6; + *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7; + +#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0; + *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1; + *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2; + *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3; + *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4; + *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5; + *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6; + *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7; + *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0; + *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1; + *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2; + *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3; + *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4; + *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5; + *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6; + *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7; + *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0; + *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1; + *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2; + *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3; + *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4; + *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5; + *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6; + *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7; + *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0; + *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1; + *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2; + *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3; + *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4; + *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5; + *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6; + *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7; + *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0; + *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1; + *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2; + *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3; + *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4; + *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5; + *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6; + *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7; + *(__global DATA_TYPE *)(dst_addr + 48 * 
dst_stride_z) = out6.s0; + *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1; + *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2; + *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3; + *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4; + *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5; + *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6; + *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7; + *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0; + *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1; + *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2; + *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3; + *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4; + *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5; + *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6; + *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7; +#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) +} +/** This OpenCL kernel performs Winograd filter transform 7x7/7x1 or 1x7 when the data layout is NHWC and the output tile is 2x2/2x1 or 1x2 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note If this kernel is used to perform Winograd filter transform 7x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd filter transform 1x7, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_2x2_7x7_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z); + + const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * sizeof(DATA_TYPE) + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w; + +#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + // Load the values from the input tensor + DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z)); + DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z)); + DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z)); + DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z)); + DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z)); + DATA_TYPE w05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z)); + DATA_TYPE w06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z)); +#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + // Load the values from the input tensor + DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)); + DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)); + DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)); + DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)); + DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)); + DATA_TYPE w05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)); + DATA_TYPE w06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_y)); +#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + +#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y)); + DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y)); + DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y)); + DATA_TYPE w13 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 3 * src_stride_y)); + DATA_TYPE w14 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 4 * src_stride_y)); + DATA_TYPE w15 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 5 * src_stride_y)); + DATA_TYPE w16 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 6 * src_stride_y)); + + DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y)); + DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y)); + DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y)); + DATA_TYPE w23 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 3 * src_stride_y)); + DATA_TYPE w24 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z
+ 4 * src_stride_y)); + DATA_TYPE w25 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 5 * src_stride_y)); + DATA_TYPE w26 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 6 * src_stride_y)); + + DATA_TYPE w30 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 0 * src_stride_y)); + DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 1 * src_stride_y)); + DATA_TYPE w32 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 2 * src_stride_y)); + DATA_TYPE w33 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 3 * src_stride_y)); + DATA_TYPE w34 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 4 * src_stride_y)); + DATA_TYPE w35 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 5 * src_stride_y)); + DATA_TYPE w36 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 6 * src_stride_y)); + + DATA_TYPE w40 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 0 * src_stride_y)); + DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 1 * src_stride_y)); + DATA_TYPE w42 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 2 * src_stride_y)); + DATA_TYPE w43 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 3 * src_stride_y)); + DATA_TYPE w44 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 4 * src_stride_y)); + DATA_TYPE w45 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 5 * src_stride_y)); + DATA_TYPE w46 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 6 * src_stride_y)); + + DATA_TYPE w50 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 0 * src_stride_y)); + DATA_TYPE w51 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 1 * src_stride_y)); + DATA_TYPE w52 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 2 * src_stride_y)); + DATA_TYPE w53 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 3 * src_stride_y)); + DATA_TYPE w54 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 4 * src_stride_y)); + DATA_TYPE w55 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 5 * src_stride_y)); + DATA_TYPE w56 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 6 * src_stride_y)); + + DATA_TYPE w60 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 0 * src_stride_y)); + DATA_TYPE w61 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 1 * src_stride_y)); + DATA_TYPE w62 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 2 * src_stride_y)); + DATA_TYPE w63 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 3 * src_stride_y)); + DATA_TYPE w64 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 4 * src_stride_y)); + DATA_TYPE w65 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 5 * src_stride_y)); + DATA_TYPE w66 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 6 * src_stride_y)); + +#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + + VEC_DATA_TYPE(DATA_TYPE, 8) + tmp = 0.0f; + + // Row 0 + VEC_DATA_TYPE(DATA_TYPE, 8) + out0 = 0.0f; + + out0.s0 = -w00 / 36.0f; + out0.s1 = (w00 - w01 + w02 - w03 + w04 - w05 + w06) / 48.f; + out0.s2 = (w00 + w01 + w02 + w03 + w04 + w05 + w06) / 48.f; + out0.s3 = (-w00 + 2.f * w01 - 4.f * w02 + 8.f * w03 - 16.f * w04 + 32.f * w05 - 64.f * w06) / 120.f; + out0.s4 = (-w00 - 2.f * w01 - 4.f * w02 - 8.f * w03 - 16.f * w04 - 32.f * w05 - 64.f * w06) / 120.f; + out0.s5 = (w00 - 3.f * w01 + 9.f * w02 - 27.f * w03 + 81.f * w04 - 243.f * w05 + 729.f * w06) / 720.f; + out0.s6 = (w00 + 3.f * w01 + 9.f * w02 + 27.f * w03 + 
81.f * w04 + 243.f * w05 + 729.f * w06) / 720.f; + out0.s7 = w06; + + out0 /= (VEC_DATA_TYPE(DATA_TYPE, 8)) - 36.f; + +#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + + // Row 1 + VEC_DATA_TYPE(DATA_TYPE, 8) + out1 = 0.0f; + + tmp.s0 = (w00 - w10 + w20 - w30 + w40 - w50 + w60) / 48.f; + tmp.s1 = (w01 - w11 + w21 - w31 + w41 - w51 + w61) / 48.f; + tmp.s2 = (w02 - w12 + w22 - w32 + w42 - w52 + w62) / 48.f; + tmp.s3 = (w03 - w13 + w23 - w33 + w43 - w53 + w63) / 48.f; + tmp.s4 = (w04 - w14 + w24 - w34 + w44 - w54 + w64) / 48.f; + tmp.s5 = (w05 - w15 + w25 - w35 + w45 - w55 + w65) / 48.f; + tmp.s6 = (w06 - w16 + w26 - w36 + w46 - w56 + w66) / 48.f; + + OUTPUT_ROW_2x2_7x7(out1, tmp); + + // Row 2 + VEC_DATA_TYPE(DATA_TYPE, 8) + out2 = 0.0f; + + tmp.s0 = (w00 + w10 + w20 + w30 + w40 + w50 + w60) / 48.f; + tmp.s1 = (w01 + w11 + w21 + w31 + w41 + w51 + w61) / 48.f; + tmp.s2 = (w02 + w12 + w22 + w32 + w42 + w52 + w62) / 48.f; + tmp.s3 = (w03 + w13 + w23 + w33 + w43 + w53 + w63) / 48.f; + tmp.s4 = (w04 + w14 + w24 + w34 + w44 + w54 + w64) / 48.f; + tmp.s5 = (w05 + w15 + w25 + w35 + w45 + w55 + w65) / 48.f; + tmp.s6 = (w06 + w16 + w26 + w36 + w46 + w56 + w66) / 48.f; + + OUTPUT_ROW_2x2_7x7(out2, tmp); + + // Row 3 + VEC_DATA_TYPE(DATA_TYPE, 8) + out3 = 0.0f; + + tmp.s0 = (-w00 + 2.f * w10 - 4.f * w20 + 8.f * w30 - 16.f * w40 + 32.f * w50 - 64.f * w60) / 120.f; + tmp.s1 = (-w01 + 2.f * w11 - 4.f * w21 + 8.f * w31 - 16.f * w41 + 32.f * w51 - 64.f * w61) / 120.f; + tmp.s2 = (-w02 + 2.f * w12 - 4.f * w22 + 8.f * w32 - 16.f * w42 + 32.f * w52 - 64.f * w62) / 120.f; + tmp.s3 = (-w03 + 2.f * w13 - 4.f * w23 + 8.f * w33 - 16.f * w43 + 32.f * w53 - 64.f * w63) / 120.f; + tmp.s4 = (-w04 + 2.f * w14 - 4.f * w24 + 8.f * w34 - 16.f * w44 + 32.f * w54 - 64.f * w64) / 120.f; + tmp.s5 = (-w05 + 2.f * w15 - 4.f * w25 + 8.f * w35 - 16.f * w45 + 32.f * w55 - 64.f * w65) / 120.f; + tmp.s6 = (-w06 + 2.f * w16 - 4.f * w26 + 8.f * w36 - 16.f * w46 + 32.f * w56 - 64.f * w66) / 120.f; + + OUTPUT_ROW_2x2_7x7(out3, tmp); + + // Row 4 + VEC_DATA_TYPE(DATA_TYPE, 8) + out4 = 0.0f; + + tmp.s0 = (-w00 - 2.f * w10 - 4.f * w20 - 8.f * w30 - 16.f * w40 - 32.f * w50 - 64.f * w60) / 120.f; + tmp.s1 = (-w01 - 2.f * w11 - 4.f * w21 - 8.f * w31 - 16.f * w41 - 32.f * w51 - 64.f * w61) / 120.f; + tmp.s2 = (-w02 - 2.f * w12 - 4.f * w22 - 8.f * w32 - 16.f * w42 - 32.f * w52 - 64.f * w62) / 120.f; + tmp.s3 = (-w03 - 2.f * w13 - 4.f * w23 - 8.f * w33 - 16.f * w43 - 32.f * w53 - 64.f * w63) / 120.f; + tmp.s4 = (-w04 - 2.f * w14 - 4.f * w24 - 8.f * w34 - 16.f * w44 - 32.f * w54 - 64.f * w64) / 120.f; + tmp.s5 = (-w05 - 2.f * w15 - 4.f * w25 - 8.f * w35 - 16.f * w45 - 32.f * w55 - 64.f * w65) / 120.f; + tmp.s6 = (-w06 - 2.f * w16 - 4.f * w26 - 8.f * w36 - 16.f * w46 - 32.f * w56 - 64.f * w66) / 120.f; + + OUTPUT_ROW_2x2_7x7(out4, tmp); + + // Row 5 + VEC_DATA_TYPE(DATA_TYPE, 8) + out5 = 0.0f; + + tmp.s0 = (w00 - 3.f * w10 + 9.f * w20 - 27.f * w30 + 81.f * w40 - 243.f * w50 + 729.f * w60) / 720.f; + tmp.s1 = (w01 - 3.f * w11 + 9.f * w21 - 27.f * w31 + 81.f * w41 - 243.f * w51 + 729.f * w61) / 720.f; + tmp.s2 = (w02 - 3.f * w12 + 9.f * w22 - 27.f * w32 + 81.f * w42 - 243.f * w52 + 729.f * w62) / 720.f; + tmp.s3 = (w03 - 3.f * w13 + 9.f * w23 - 27.f * w33 + 81.f * w43 - 243.f * w53 + 729.f * w63) / 720.f; + tmp.s4 = (w04 - 3.f * w14 + 9.f * w24 - 27.f * w34 + 81.f * w44 - 243.f * w54 + 729.f * w64) / 720.f; + tmp.s5 = (w05 - 3.f * w15 + 9.f * w25 - 27.f * w35 + 81.f * w45 - 243.f * w55 + 729.f * w65) 
/ 720.f; + tmp.s6 = (w06 - 3.f * w16 + 9.f * w26 - 27.f * w36 + 81.f * w46 - 243.f * w56 + 729.f * w66) / 720.f; + + OUTPUT_ROW_2x2_7x7(out5, tmp); + + // Row 6 + VEC_DATA_TYPE(DATA_TYPE, 8) + out6 = 0.0f; + + tmp.s0 = (w00 + 3.f * w10 + 9.f * w20 + 27.f * w30 + 81.f * w40 + 243.f * w50 + 729.f * w60) / 720.f; + tmp.s1 = (w01 + 3.f * w11 + 9.f * w21 + 27.f * w31 + 81.f * w41 + 243.f * w51 + 729.f * w61) / 720.f; + tmp.s2 = (w02 + 3.f * w12 + 9.f * w22 + 27.f * w32 + 81.f * w42 + 243.f * w52 + 729.f * w62) / 720.f; + tmp.s3 = (w03 + 3.f * w13 + 9.f * w23 + 27.f * w33 + 81.f * w43 + 243.f * w53 + 729.f * w63) / 720.f; + tmp.s4 = (w04 + 3.f * w14 + 9.f * w24 + 27.f * w34 + 81.f * w44 + 243.f * w54 + 729.f * w64) / 720.f; + tmp.s5 = (w05 + 3.f * w15 + 9.f * w25 + 27.f * w35 + 81.f * w45 + 243.f * w55 + 729.f * w65) / 720.f; + tmp.s6 = (w06 + 3.f * w16 + 9.f * w26 + 27.f * w36 + 81.f * w46 + 243.f * w56 + 729.f * w66) / 720.f; + + OUTPUT_ROW_2x2_7x7(out6, tmp); + + // Row 7 + VEC_DATA_TYPE(DATA_TYPE, 8) + out7 = 0.0f; + + tmp.s0 = w60; + tmp.s1 = w61; + tmp.s2 = w62; + tmp.s3 = w63; + tmp.s4 = w64; + tmp.s5 = w65; + tmp.s6 = w66; + + OUTPUT_ROW_2x2_7x7(out7, tmp); + +#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + + int x0 = get_global_id(2); // idx filter + int y0 = get_global_id(0); // idx channel + + // Get output address + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y; + + // Store the values across the channels + *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0; + *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1; + *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2; + *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3; + *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4; + *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5; + *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6; + *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7; + +#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0; + *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1; + *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2; + *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3; + *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4; + *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5; + *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6; + *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7; + *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0; + *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1; + *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2; + *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3; + *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4; + *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5; + *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6; + *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7; + *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0; + *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1; + *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2; + *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3; + *(__global DATA_TYPE 
*)(dst_addr + 28 * dst_stride_z) = out3.s4; + *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5; + *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6; + *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7; + *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0; + *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1; + *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2; + *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3; + *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4; + *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5; + *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6; + *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7; + *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0; + *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1; + *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2; + *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3; + *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4; + *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5; + *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6; + *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7; + *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0; + *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1; + *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2; + *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3; + *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4; + *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5; + *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6; + *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7; + *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0; + *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1; + *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2; + *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3; + *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4; + *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5; + *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6; + *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7; +#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) +} +#endif // defined(SRC_DIM_Z) + +#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) + +/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NHWC and the output tile is 4x1 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_4x1_3x1_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_filter_transform_4x4_3x3_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} + +/** This OpenCL kernel performs Winograd filter transform 5x1 when the data layout is NHWC and the output tile is 4x1 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_4x1_5x1_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_filter_transform_4x4_5x5_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} + +/** This OpenCL kernel performs Winograd filter transform 7x1 when the data layout is NHWC and the output tile is 2x1 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_2x1_7x1_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_filter_transform_2x2_7x7_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} +#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) + +#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) +/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NHWC and the output tile is 1x4 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_1x4_1x3_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_filter_transform_4x4_3x3_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} + +/** This OpenCL kernel performs Winograd filter transform 1x5 when the data layout is NHWC and the output tile is 1x4 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_1x4_1x5_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_filter_transform_4x4_5x5_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} + +/** This OpenCL kernel performs Winograd filter transform 1x7 when the data layout is NHWC and the output tile is 1x2 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_1x2_1x7_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_filter_transform_2x2_7x7_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} +#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl new file mode 100644 index 0000000000..4865982a55 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl @@ -0,0 +1,953 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" +#include "tile_helpers.h" + +#define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact) \ + ({ \ + comm_fact.s0 = tmp.s2 - 4.25f * tmp.s4 + tmp.s6; \ + comm_fact.s1 = tmp.s1 - 4.25f * tmp.s3 + tmp.s5; \ + comm_fact.s2 = 2.5f * tmp.s3; \ + comm_fact.s3 = 0.5f * tmp.s1 + 2.f * tmp.s5 - comm_fact.s2; \ + comm_fact.s4 = 0.25f * tmp.s2 - 1.25f * tmp.s4 + tmp.s6; \ + comm_fact.s5 = 4.f * tmp.s2 + tmp.s6 - 5.f * tmp.s4; \ + comm_fact.s6 = 2.f * tmp.s1 + 0.5f * tmp.s5 - comm_fact.s2; \ + \ + out.s0 = tmp.s0 - tmp.s6 + 5.25f * tmp.s4 - 5.25f * tmp.s2; \ + out.s1 = comm_fact.s0 + comm_fact.s1; \ + out.s2 = comm_fact.s0 - comm_fact.s1; \ + out.s3 = comm_fact.s3 + comm_fact.s4; \ + out.s4 = comm_fact.s4 - comm_fact.s3; \ + out.s5 = comm_fact.s5 + comm_fact.s6; \ + out.s6 = comm_fact.s5 - comm_fact.s6; \ + out.s7 = tmp.s7 - tmp.s1 + 5.25f * tmp.s3 - 5.25f * tmp.s5; \ + }) + +#define OUTPUT_ROW_2x2_7x7(out, tmp, comm_fact) \ + ({ \ + comm_fact.s0 = 36.0f * tmp.s2 - 13.0f * tmp.s4 + tmp.s6; \ + comm_fact.s1 = 36.0f * tmp.s1 - 13.0f * tmp.s3 + 1.0f * tmp.s5; \ + comm_fact.s2 = 9.0f * tmp.s2 - 10.0f * tmp.s4 + tmp.s6; \ + comm_fact.s3 = 18.0f * tmp.s1 - 20.0f * tmp.s3 + 2.0f * tmp.s5; \ + comm_fact.s4 = 4.0f * tmp.s2 - 5.0f * tmp.s4 + tmp.s6; \ + comm_fact.s5 = 12.0f * tmp.s1 - 15.0f * tmp.s3 + 3.0f * tmp.s5; \ + out.s0 = -36.0f * tmp.s0 + 49.0f * tmp.s2 + -14.0f * tmp.s4 + tmp.s6; \ + out.s1 = comm_fact.s0 - comm_fact.s1; \ + out.s2 = comm_fact.s0 + comm_fact.s1; \ + out.s3 = comm_fact.s2 - comm_fact.s3; \ + out.s4 = comm_fact.s2 + comm_fact.s3; \ + out.s5 = comm_fact.s4 - comm_fact.s5; \ + out.s6 = comm_fact.s4 + comm_fact.s5; \ + out.s7 = -36.0f * tmp.s1 + 0.0f * tmp.s2 + 49.0f * tmp.s3 - 14.0f * tmp.s5 + tmp.s7; \ + }) + +#if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H) + +#if defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y) +//! @cond Doxygen_Suppress +/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC + * + * @note Data layout supported: NHWC + * @note Data type supported: F32/F16 + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) + * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3). + * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) + * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. 
-DOUTPUT_TILE_H=4 + * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +//! @endcond +__kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc( + TENSOR4D(src, BUFFER), + TENSOR4D(dst, BUFFER)) +{ + const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM + const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y + const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX + + // All the tensor dimensions are passed at compile time. + // In case of dynamic tensor support, the following dimensions should be passed as function argument. 
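+    // NOTE (illustrative sketch, not part of the original kernel): the defines consumed
+    // below are baked in when the program is built. Assuming hypothetical host-side
+    // variables (src_w, src_h, num_tiles_x, num_tiles_y, pad_l, pad_t), the build
+    // options for this kernel could be assembled along these lines:
+    //
+    //   char opts[512];
+    //   snprintf(opts, sizeof(opts),
+    //            "-DDATA_TYPE=half -DNHWC -DSRC_WIDTH=%d -DSRC_HEIGHT=%d "
+    //            "-DNUM_TILES_X=%d -DNUM_TILES_Y=%d -DPAD_LEFT=%d -DPAD_TOP=%d "
+    //            "-DOUTPUT_TILE_W=4 -DOUTPUT_TILE_H=4",
+    //            src_w, src_h, num_tiles_x, num_tiles_y, pad_l, pad_t);
+    //   clBuildProgram(program, 1, &device_id, opts, NULL, NULL);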
+#define _ISRC_WIDTH SRC_WIDTH +#define _ISRC_HEIGHT SRC_HEIGHT +#define _INUM_TILES_X NUM_TILES_X +#define _INUM_TILES_Y NUM_TILES_Y + + int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W; + int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H; + x -= PAD_LEFT; + y -= PAD_TOP; + +#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + + TILE(DATA_TYPE, 6, 1, in); + TILE(DATA_TYPE, 6, 1, out); + + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 1, 6, + { + in[i].v = 0; + }) + +#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) + T_LOAD_NHWC(DATA_TYPE, 1, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); +#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) + T_LOAD_NHWC(DATA_TYPE, 6, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); +#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) + + TILE(DATA_TYPE, 6, 1, com); + + LOOP_UNROLLING(int, i, 0, 1, 6, + { + in[i].v *= 4.0f; + }) + + com[0].v = in[2].v - 4.f * in[0].v; + com[1].v = in[3].v - 4.f * in[1].v; + com[2].v = in[4].v - 4.f * in[2].v; + com[3].v = in[5].v - 4.f * in[3].v; + com[4].v = in[3].v - in[1].v; + com[4].v = com[4].v + com[4].v; + com[5].v = in[4].v - in[2].v; + + out[0].v = com[2].v - com[0].v; + out[1].v = com[2].v + com[1].v; + out[2].v = com[2].v - com[1].v; + out[3].v = com[5].v + com[4].v; + out[4].v = com[5].v - com[4].v; + out[5].v = com[3].v - com[1].v; + + TILE(uint, 6, 1, dst_indirect_y); + + LOOP_UNROLLING(int, i, 0, 1, 6, + { + dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y; + dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 6; + }) + + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 6, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); + +#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + + TILE(DATA_TYPE, 36, 1, in); + + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 1, 36, + { + in[i].v = 0; + }) + + // Load the tile from a NHWC tensor + T_LOAD_NHWC(DATA_TYPE, 6, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); + + TILE(DATA_TYPE, 6, 1, com); + TILE(DATA_TYPE, 36, 1, tmp); + + LOOP_UNROLLING(int, i, 0, 1, 6, + { + com[0].v = in[2 * 6 + i].v - (DATA_TYPE)4.0f * in[0 * 6 + i].v; + com[1].v = in[3 * 6 + i].v - (DATA_TYPE)4.0f * in[1 * 6 + i].v; + com[2].v = in[4 * 6 + i].v - (DATA_TYPE)4.0f * in[2 * 6 + i].v; + com[3].v = in[5 * 6 + i].v - (DATA_TYPE)4.0f * in[3 * 6 + i].v; + com[4].v = in[3 * 6 + i].v - in[1 * 6 + i].v; + com[4].v = com[4].v + com[4].v; + com[5].v = in[4 * 6 + i].v - in[2 * 6 + i].v; + tmp[i + 0 * 6].v = com[2].v - com[0].v; + tmp[i + 1 * 6].v = com[2].v + com[1].v; + tmp[i + 2 * 6].v = com[2].v - com[1].v; + tmp[i + 3 * 6].v = com[5].v + com[4].v; + tmp[i + 4 * 6].v = com[5].v - com[4].v; + tmp[i + 5 * 6].v = com[3].v - com[1].v; + }) + + TILE(DATA_TYPE, 36, 1, out); + + LOOP_UNROLLING(int, i, 0, 1, 6, + { + com[0].v = tmp[i * 6 + 2].v - 4.f *tmp[i * 6 + 0].v; + com[1].v = tmp[i * 6 + 3].v - 4.f *tmp[i * 6 + 1].v; + com[2].v = tmp[i * 6 + 4].v - 4.f *tmp[i * 6 + 2].v; + com[3].v = tmp[i * 6 + 5].v - 4.f *tmp[i * 6 + 3].v; + com[4].v = tmp[i * 6 + 3].v - tmp[i * 6 + 1].v; + com[4].v = com[4].v + com[4].v; + com[5].v = tmp[i * 6 + 4].v - tmp[i * 6 + 2].v; + out[i * 6 + 0].v = com[2].v - com[0].v; + out[i * 6 + 1].v = com[2].v + com[1].v; + out[i * 6 + 2].v = com[2].v - com[1].v; + out[i * 6 + 3].v = com[5].v + com[4].v; + out[i * 6 + 4].v = com[5].v - com[4].v; 
+ out[i * 6 + 5].v = com[3].v - com[1].v; + }) + + // Compute destination address + TILE(uint, 36, 1, dst_indirect_y); + + LOOP_UNROLLING(int, i, 0, 1, 36, + { + dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y; + dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 36; + }) + + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 36, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); +#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) +} + +//! @cond Doxygen_Suppress +/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NHWC + * + * @note Data layout supported: NHWC + * @note Data type supported: F32/F16 + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) + * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3). + * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) + * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor.
Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +//! @endcond +__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc( + TENSOR4D(src, BUFFER), + TENSOR4D(dst, BUFFER)) +{ + const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM + const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y + const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX + + // All the tensor dimensions are passed at compile time. + // In case of dynamic tensor support, the following dimensions should be passed as function argument. +#define _ISRC_WIDTH SRC_WIDTH +#define _ISRC_HEIGHT SRC_HEIGHT +#define _INUM_TILES_X NUM_TILES_X +#define _INUM_TILES_Y NUM_TILES_Y + + int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W; + int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H; + x -= PAD_LEFT; + y -= PAD_TOP; + +#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + + TILE(DATA_TYPE, 8, 1, in); + TILE(DATA_TYPE, 8, 1, out); + + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 1, 8, + { + in[i].v = 0; + }) + +#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) + T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); +#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) + T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); +#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) + + TILE(DATA_TYPE, 1, 8, com); + + com[0].s[0] = in[2].v - 4.25f * in[4].v + in[6].v; + com[0].s[1] = in[1].v - 4.25f * in[3].v + in[5].v; + com[0].s[2] = 0.5f * in[1].v - 2.5f * in[3].v + 2.0f * in[5].v; + com[0].s[3] = 0.25f * in[2].v - 1.25f * in[4].v + in[6].v; + com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v; + com[0].s[5] = 2.0f * in[1].v - 2.5f * in[3].v + 0.5f * in[5].v; + out[0].s[0] = in[0].v - 5.25f * in[2].v + 5.25f * in[4].v - in[6].v; + out[1].s[0] = com[0].s[0] + com[0].s[1]; + out[2].s[0] = com[0].s[0] - com[0].s[1]; + out[3].s[0] = com[0].s[3] + com[0].s[2]; + out[4].s[0] = com[0].s[3] - com[0].s[2]; + out[5].s[0] = com[0].s[4] + com[0].s[5]; + out[6].s[0] = com[0].s[4] - com[0].s[5]; + out[7].s[0] = -in[1].v + 5.25f * in[3].v - 5.25f * in[5].v + in[7].v; + + TILE(uint, 8, 1, dst_indirect_y); + + LOOP_UNROLLING(int, i, 0, 1, 8, + { + dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y; + dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 8; + }) + + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); + +#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + + TILE(DATA_TYPE, 64, 1, in); + 
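+    // NOTE (illustrative commentary, not part of the original kernel): the 64 values
+    // loaded below form an 8x8 input tile d, and the two unrolled passes that follow
+    // compute the Winograd input transform U = B^T * d * B for F(4x4, 5x5): the first
+    // pass combines the tile down its columns (com[0..7], rearranged into tmp), the
+    // second combines tmp across its rows. For example, the first transformed row
+    // reduces to
+    //   U[0][j] = d[0][j] - 5.25f * d[2][j] + 5.25f * d[4][j] - d[6][j]
+    // which is the com[6] term of the first pass (stored as tmp[0]).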
TILE(DATA_TYPE, 64, 1, out);
+
+    // Initialize the input tile
+    LOOP_UNROLLING(int, i, 0, 1, 64,
+    {
+        in[i].v = 0;
+    })
+
+    // Load the tile from a NHWC tensor
+    T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+
+    TILE(DATA_TYPE, 8, 8, com);
+
+    LOOP_UNROLLING(int, i, 0, 1, 8,
+    {
+        com[0].s[i] = in[2 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x
+        com[1].s[i] = in[1 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0]; // x
+        com[2].s[i] = (DATA_TYPE)0.25f * in[2 * 8 + i].s[0] - (DATA_TYPE)1.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x
+        com[3].s[i] = (DATA_TYPE)0.5f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0]; // x
+        com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
+        com[5].s[i] = (DATA_TYPE)2.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)0.5f * in[5 * 8 + i].s[0];
+        com[6].s[i] = in[0 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[2 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[4 * 8 + i].s[0] - in[6 * 8 + i].s[0];
+        com[7].s[i] = -in[1 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[3 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[5 * 8 + i].s[0] + in[7 * 8 + i].s[0];
+    })
+
+    TILE(DATA_TYPE, 8, 8, tmp);
+    tmp[0].v = com[6].v;
+    tmp[1].v = com[0].v + com[1].v;
+    tmp[2].v = com[0].v - com[1].v;
+    tmp[3].v = com[2].v + com[3].v;
+    tmp[4].v = com[2].v - com[3].v;
+    tmp[5].v = com[4].v + com[5].v;
+    tmp[6].v = com[4].v - com[5].v;
+    tmp[7].v = com[7].v;
+
+    LOOP_UNROLLING(int, i, 0, 1, 8,
+    {
+        com[0].s[0] = tmp[i].s[2] - 4.25f * tmp[i].s[4] + tmp[i].s[6];
+        com[0].s[1] = tmp[i].s[1] - 4.25f * tmp[i].s[3] + tmp[i].s[5];
+        com[0].s[2] = 0.5f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 2.0f * tmp[i].s[5];
+        com[0].s[3] = 0.25f * tmp[i].s[2] - 1.25f * tmp[i].s[4] + tmp[i].s[6];
+        com[0].s[4] = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6];
+        com[0].s[5] = 2.0f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 0.5f * tmp[i].s[5];
+        out[i * 8 + 0].s[0] = tmp[i].s[0] - 5.25f * tmp[i].s[2] + 5.25f * tmp[i].s[4] - tmp[i].s[6];
+        out[i * 8 + 1].s[0] = com[0].s[0] + com[0].s[1];
+        out[i * 8 + 2].s[0] = com[0].s[0] - com[0].s[1];
+        out[i * 8 + 3].s[0] = com[0].s[3] + com[0].s[2];
+        out[i * 8 + 4].s[0] = com[0].s[3] - com[0].s[2];
+        out[i * 8 + 5].s[0] = com[0].s[4] + com[0].s[5];
+        out[i * 8 + 6].s[0] = com[0].s[4] - com[0].s[5];
+        out[i * 8 + 7].s[0] = -tmp[i].s[1] + 5.25f * tmp[i].s[3] - 5.25f * tmp[i].s[5] + tmp[i].s[7];
+    })
+
+    TILE(uint, 64, 1, dst_indirect_y);
+
+    LOOP_UNROLLING(int, i, 0, 1, 64,
+    {
+        dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
+        dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 64;
+    })
+
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+
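+// Editor's note (illustrative commentary, not part of the upstream patch): the scalar
+// section above is the F(4x4, 5x5) input transform out = B^T * d for an 8-point input
+// d0..d7, applied along rows and then along columns in the 2D path. Reading the
+// coefficients directly off the code, the rows of B^T are:
+//
+//   out0 = d0 - 5.25*d2 + 5.25*d4 - d6
+//   out1 = d1 + d2 - 4.25*d3 - 4.25*d4 + d5 + d6
+//   out2 = -d1 + d2 + 4.25*d3 - 4.25*d4 - d5 + d6
+//   out3 = 0.5*d1 + 0.25*d2 - 2.5*d3 - 1.25*d4 + 2*d5 + d6
+//   out4 = -0.5*d1 + 0.25*d2 + 2.5*d3 - 1.25*d4 - 2*d5 + d6
+//   out5 = 2*d1 + 4*d2 - 2.5*d3 - 5*d4 + 0.5*d5 + d6
+//   out6 = -2*d1 + 4*d2 + 2.5*d3 - 5*d4 - 0.5*d5 + d6
+//   out7 = -d1 + 5.25*d3 - 5.25*d5 + d7
+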
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 7x7/7x1 or 1x7 and the output tile is 2x2/2x1 or 1x2 when the data layout is NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (e.g. -DNUM_TILES_X=5, -DNUM_TILES_Y=3).
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+//! @endcond
+__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(
+    TENSOR4D(src, BUFFER),
+    TENSOR4D(dst, BUFFER))
+{
+    const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
+    const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
+    const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+
+    // All the tensor dimensions are passed at compile time.
+    // In case of dynamic tensor support, the following dimensions should be passed as function argument.
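+    // Editor's note (illustrative, not part of the upstream patch): each work-item
+    // handles one output tile, linearly indexed by mout. For example, with
+    // NUM_TILES_X = 5, OUTPUT_TILE_W = OUTPUT_TILE_H = 2 and PAD_LEFT = PAD_TOP = 2,
+    // mout = 7 selects tile (7 % 5, 7 / 5) = (2, 1), so the code below reads the
+    // input patch whose top-left corner is (2 * 2 - 2, 1 * 2 - 2) = (2, 0).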
+#define _ISRC_WIDTH SRC_WIDTH +#define _ISRC_HEIGHT SRC_HEIGHT +#define _INUM_TILES_X NUM_TILES_X +#define _INUM_TILES_Y NUM_TILES_Y + + int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W; + int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H; + x -= PAD_LEFT; + y -= PAD_TOP; + +#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + + TILE(DATA_TYPE, 8, 1, in); + TILE(DATA_TYPE, 8, 1, out); + + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 1, 8, + { + in[i].v = 0; + }) + +#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) + T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); +#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) + T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); +#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) + + LOOP_UNROLLING(int, i, 0, 1, 8, + { + in[i].v *= (DATA_TYPE) - 36.0f; + }) + + TILE(DATA_TYPE, 1, 8, com) = { { { 0 } } }; + + com[0].s[0] = 36.0f * in[2].v - 13.0f * in[4].v + in[6].v; + com[0].s[1] = 36.0f * in[1].v - 13.0f * in[3].v + 1.0f * in[5].v; + com[0].s[2] = 9.0f * in[2].v - 10.0f * in[4].v + in[6].v; + com[0].s[3] = 18.0f * in[1].v - 20.0f * in[3].v + 2.0f * in[5].v; + com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v; + com[0].s[5] = 12.0f * in[1].v - 15.0f * in[3].v + 3.0f * in[5].v; + out[0].s[0] = -36.0f * in[0].v + 49.0f * in[2].v + -14.0f * in[4].v + in[6].v; + out[1].s[0] = com[0].s[0] - com[0].s[1]; + out[2].s[0] = com[0].s[0] + com[0].s[1]; + out[3].s[0] = com[0].s[2] - com[0].s[3]; + out[4].s[0] = com[0].s[2] + com[0].s[3]; + out[5].s[0] = com[0].s[4] - com[0].s[5]; + out[6].s[0] = com[0].s[4] + com[0].s[5]; + out[7].s[0] = -36.0f * in[1].v + 0.0f * in[2].v + 49.0f * in[3].v - 14.0f * in[5].v + in[7].v; + + TILE(uint, 8, 1, dst_indirect_y); + + LOOP_UNROLLING(int, i, 0, 1, 8, + { + dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y; + dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 8; + }) + + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); + +#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) + + TILE(DATA_TYPE, 64, 1, in); + TILE(DATA_TYPE, 64, 1, out); + + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 1, 64, + { + in[i].v = 0; + }) + + // Load the tile from a NHWC tensor + T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); + + TILE(DATA_TYPE, 8, 8, com); + + LOOP_UNROLLING(int, i, 0, 1, 8, + { + com[0].s[i] = (DATA_TYPE)36.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; + com[1].s[i] = (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0]; + com[2].s[i] = (DATA_TYPE)9.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)10.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; + com[3].s[i] = (DATA_TYPE)18.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)20.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0]; + com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; + com[5].s[i] = (DATA_TYPE)12.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)15.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)3.0f * in[5 * 8 + i].s[0]; + com[6].s[i] = (DATA_TYPE)49.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[0 * 8 + i].s[0] + in[6 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[4 * 8 + i].s[0]; + 
com[7].s[i] = (DATA_TYPE)49.0f * in[3 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] + in[7 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[5 * 8 + i].s[0]; + }) + + TILE(DATA_TYPE, 8, 8, tmp); + tmp[0].v = com[6].v; + tmp[1].v = com[0].v - com[1].v; + tmp[2].v = com[0].v + com[1].v; + tmp[3].v = com[2].v - com[3].v; + tmp[4].v = com[2].v + com[3].v; + tmp[5].v = com[4].v - com[5].v; + tmp[6].v = com[4].v + com[5].v; + tmp[7].v = com[7].v; + + LOOP_UNROLLING(int, i, 0, 1, 8, + { + com[0].s[0] = 36.0f * tmp[i].s[2] - 13.0f * tmp[i].s[4] + tmp[i].s[6]; + com[0].s[1] = 36.0f * tmp[i].s[1] - 13.0f * tmp[i].s[3] + 1.0f * tmp[i].s[5]; + com[0].s[2] = 9.0f * tmp[i].s[2] - 10.0f * tmp[i].s[4] + tmp[i].s[6]; + com[0].s[3] = 18.0f * tmp[i].s[1] - 20.0f * tmp[i].s[3] + 2.0f * tmp[i].s[5]; + com[0].s[4] = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6]; + com[0].s[5] = 12.0f * tmp[i].s[1] - 15.0f * tmp[i].s[3] + 3.0f * tmp[i].s[5]; + out[i * 8 + 0].s[0] = -36.0f * tmp[i].s[0] + 49.0f * tmp[i].s[2] + -14.0f * tmp[i].s[4] + tmp[i].s[6]; + out[i * 8 + 1].s[0] = com[0].s[0] - com[0].s[1]; + out[i * 8 + 2].s[0] = com[0].s[0] + com[0].s[1]; + out[i * 8 + 3].s[0] = com[0].s[2] - com[0].s[3]; + out[i * 8 + 4].s[0] = com[0].s[2] + com[0].s[3]; + out[i * 8 + 5].s[0] = com[0].s[4] - com[0].s[5]; + out[i * 8 + 6].s[0] = com[0].s[4] + com[0].s[5]; + out[i * 8 + 7].s[0] = -36.0f * tmp[i].s[1] + 0.0f * tmp[i].s[2] + 49.0f * tmp[i].s[3] - 14.0f * tmp[i].s[5] + tmp[i].s[7]; + }) + + TILE(uint, 64, 1, dst_indirect_y); + + LOOP_UNROLLING(int, i, 0, 1, 64, + { + dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y; + dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 64; + }) + + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); + +#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) +} + +//! @cond Doxygen_Suppress +/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC + * + * @note Data layout supported: NHWC + * @note Data type supported: F32/F16 + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) + * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3). + * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) + * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source image. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +//! @endcond +__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc( + TENSOR4D(src, BUFFER), + TENSOR4D(dst, BUFFER)) +{ + winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes); +} + +//! @cond Doxygen_Suppress +/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC + * + * @note Data layout supported: NHWC + * @note Data type supported: F32/F16 + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) + * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3). + * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) + * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. 
-DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+//! @endcond
+__kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(
+    TENSOR4D(src, BUFFER),
+    TENSOR4D(dst, BUFFER))
+{
+    winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
+                                                 src_stride_x,
+                                                 src_step_x,
+                                                 src_stride_y,
+                                                 src_step_y,
+                                                 src_stride_z,
+                                                 src_step_z,
+                                                 src_stride_w,
+                                                 src_step_w,
+                                                 src_offset_first_element_in_bytes,
+                                                 dst_ptr,
+                                                 dst_stride_x,
+                                                 dst_step_x,
+                                                 dst_stride_y,
+                                                 dst_step_y,
+                                                 dst_stride_z,
+                                                 dst_step_z,
+                                                 dst_stride_w,
+                                                 dst_step_w,
+                                                 dst_offset_first_element_in_bytes);
+}
+
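+// Editor's note (illustrative, not part of the upstream patch): the 1D kernels such as
+// winograd_input_transform_4x1_5x1_stepz1_nhwc above contain no transform logic of
+// their own; they forward all tensor arguments to the corresponding 2D kernel, and the
+// compile-time defines select the 1D code path. A host program would typically build
+// the horizontal 5x1 variant with options along these lines (all values are
+// placeholders for illustration):
+//
+//   clBuildProgram(program, 1, &device, "-DDATA_TYPE=half -DNHWC"
+//                  " -DSRC_WIDTH=96 -DSRC_HEIGHT=64 -DNUM_TILES_X=24 -DNUM_TILES_Y=1"
+//                  " -DPAD_LEFT=2 -DPAD_TOP=0 -DOUTPUT_TILE_W=4 -DOUTPUT_TILE_H=1"
+//                  " -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL", NULL, NULL);
+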
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (e.g. -DNUM_TILES_X=5, -DNUM_TILES_Y=3).
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+//! @endcond
+__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc(
+    TENSOR4D(src, BUFFER),
+    TENSOR4D(dst, BUFFER))
+{
+    winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
+                                                 src_stride_x,
+                                                 src_step_x,
+                                                 src_stride_y,
+                                                 src_step_y,
+                                                 src_stride_z,
+                                                 src_step_z,
+                                                 src_stride_w,
+                                                 src_step_w,
+                                                 src_offset_first_element_in_bytes,
+                                                 dst_ptr,
+                                                 dst_stride_x,
+                                                 dst_step_x,
+                                                 dst_stride_y,
+                                                 dst_step_y,
+                                                 dst_stride_z,
+                                                 dst_step_z,
+                                                 dst_stride_w,
+                                                 dst_step_w,
+                                                 dst_offset_first_element_in_bytes);
+}
+
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (e.g. -DNUM_TILES_X=5, -DNUM_TILES_Y=3).
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g.
-DPAD_LEFT=2, -DPAD_TOP=2) + * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +//! @endcond +__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc( + TENSOR4D(src, BUFFER), + TENSOR4D(dst, BUFFER)) +{ + winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes); +} + +//! @cond Doxygen_Suppress +/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC + * + * @note Data layout supported: NHWC + * @note Data type supported: F32/F16 + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. 
-DDATA_TYPE=half)
+ * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (e.g. -DNUM_TILES_X=5, -DNUM_TILES_Y=3).
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+//! @endcond
+__kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc(
+    TENSOR4D(src, BUFFER),
+    TENSOR4D(dst, BUFFER))
+{
+    winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
+                                                 src_stride_x,
+                                                 src_step_x,
+                                                 src_stride_y,
+                                                 src_step_y,
+                                                 src_stride_z,
+                                                 src_step_z,
+                                                 src_stride_w,
+                                                 src_step_w,
+                                                 src_offset_first_element_in_bytes,
+                                                 dst_ptr,
+                                                 dst_stride_x,
+                                                 dst_step_x,
+                                                 dst_stride_y,
+                                                 dst_step_y,
+                                                 dst_stride_z,
+                                                 dst_step_z,
+                                                 dst_stride_w,
+                                                 dst_step_w,
+                                                 dst_offset_first_element_in_bytes);
+}
+
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (e.g. -DNUM_TILES_X=5, -DNUM_TILES_Y=3).
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+//!
@endcond +__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc( + TENSOR4D(src, BUFFER), + TENSOR4D(dst, BUFFER)) +{ + winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes); +} +#endif // defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y) +#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H) diff --git a/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl new file mode 100644 index 0000000000..0fcd04e713 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl @@ -0,0 +1,1030 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "activation_float_helpers.h" +#include "helpers.h" +#include "tile_helpers.h" + +#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H) +#if defined(VEC_SIZE) && VEC_SIZE == 2 +/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 + * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32 + * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 + * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. 
-DDST_HEIGHT=32 + * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_2x2_7x7_nhwc( + TENSOR4D(src, BUFFER), + TENSOR4D(dst, BUFFER), +#if defined(HAS_BIAS) + VECTOR_DECLARATION(bias), +#endif // defined(HAS_BIAS) + int dst_size) +{ +#define _ISRC_HEIGHT SRC_HEIGHT +#define _IDST_WIDTH DST_WIDTH +#define _IDST_HEIGHT DST_HEIGHT +#define _INUM_TILES_X NUM_TILES_X + + const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM + const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES + const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX + + int x_out = (mout % _INUM_TILES_X) * OUTPUT_TILE_W; + int y_out = (mout / _INUM_TILES_X) * OUTPUT_TILE_H; + +#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + TILE(DATA_TYPE, 8, N0, in); + TILE(DATA_TYPE, 2, N0, out); + TILE(uint, 8, 1, src_indirect_y); + + // Calculate the indirect Y for the source tensor + LOOP_UNROLLING(int, i, 0, 1, 8, + { + src_indirect_y[i].v = mout + i *_ISRC_HEIGHT; + src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 8); + }) + + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 1, 8, + { + in[i].v 
= 0; + }) + + // Load the values across the 8 channels to compose the 8x1 tile + T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); + + // Compute out0 and out01 + out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v + in[5].v + in[6].v; + out[1].v = -in[1].v + in[2].v - 2.f * in[3].v + 2.0f * in[4].v - 3.0f * in[5].v + 3.0f * in[6].v + in[7].v; + +#if defined(HAS_BIAS) + // Add bias + TILE(DATA_TYPE, 1, N0, b); + + T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); + + T_ADD_BROADCAST_X(DATA_TYPE, 2, N0, out, b, out); +#endif // defined(HAS_BIAS) + + T_ACTIVATION(DATA_TYPE, 2, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); + + TILE(uint, 2, 1, dst_indirect_y); + +#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + LOOP_UNROLLING(int, yk, 0, 1, 2, + { + int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1)); + dst_indirect_y[yk].v = x_out + y_c * (int)(_IDST_WIDTH); + }) +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + LOOP_UNROLLING(int, xk, 0, 1, 2, + { + int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1)); + dst_indirect_y[xk].v = x_c + y_out * (int)(_IDST_WIDTH); + }) +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + + // Store the tile in reverse order so the invalid values are overwritten with the valid ones + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 2, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); + +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + + TILE(DATA_TYPE, 64, N0, in); + TILE(DATA_TYPE, 4, N0, out); + TILE(DATA_TYPE, 16, N0, tmp); + TILE(uint, 64, 1, src_indirect_y); + + // Calculate the indirect Y for the source tensor + LOOP_UNROLLING(int, i, 0, 1, 64, + { + src_indirect_y[i].v = mout + i *_ISRC_HEIGHT; + src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 64); + }) + + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 1, 64, + { + in[i].v = 0; + }) + + // Load the values across the 64 channels to compose the 8x8 tile + T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); + + LOOP_UNROLLING(int, i, 0, 1, 8, + { + tmp[i * 2].v = in[0 + i].v + in[8 + i].v + in[16 + i].v + in[24 + i].v + in[32 + i].v + in[40 + i].v + in[48 + i].v; + tmp[i * 2 + 1].v = -in[8 + i].v + in[16 + i].v - 2 * in[24 + i].v + 2 * in[32 + i].v + -3 * in[40 + i].v + 3 * in[48 + i].v + in[56 + i].v; + }) + + // Compute the 2x2 output tile + LOOP_UNROLLING(int, i, 0, 1, 2, + { + out[i * 2].v = tmp[0 + i].v + tmp[2 + i].v + tmp[4 + i].v + tmp[6 + i].v + tmp[8 + i].v + tmp[10 + i].v + tmp[12 + i].v; + out[i * 2 + 1].v = -tmp[2 + i].v + tmp[4 + i].v - 2 * tmp[6 + i].v + 2 * tmp[8 + i].v - 3 * tmp[10 + i].v + 3 * tmp[12 + i].v + tmp[14 + i].v; + }) + +#if defined(HAS_BIAS) + // Add bias + TILE(DATA_TYPE, 1, N0, b); + + T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); + + T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out); +#endif // defined(HAS_BIAS) + + T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); + + TILE(uint, 4, 1, dst_indirect_y); + + // Calculate the destination indirect Y + LOOP_UNROLLING(int, yk, 0, 1, 2, + { + LOOP_UNROLLING(int, xk, 0, 1, 2, + { + int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1)); + int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1)); + dst_indirect_y[xk + yk * 2].v = x_c + y_c *_IDST_WIDTH; + dst_indirect_y[xk + yk * 2].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT); + }) + }) + + // Store the tile in reverse order so the invalid values are overwritten with the valid ones + 
T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
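+// Editor's note (illustrative, not part of the upstream patch): in the F(2x2, 7x7)
+// kernel above, each 8-point row/column d0..d7 of the incoming Winograd-domain tile
+// is reduced with the A^T matrix that can be read off the two "out" rows of the code:
+//
+//   out0 = d0 + d1 + d2 + d3 + d4 + d5 + d6
+//   out1 = -d1 + d2 - 2*d3 + 2*d4 - 3*d5 + 3*d6 + d7
+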
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ */
+__kernel void winograd_output_transform_4x4_3x3_nhwc(
+    TENSOR4D(src, BUFFER),
+    TENSOR4D(dst, BUFFER),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
+    const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
+    const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+    TILE(DATA_TYPE, 6, N0, in);
+    TILE(DATA_TYPE, 4, N0, out);
+    TILE(uint, 6, 1, src_indirect_y);
+
+    LOOP_UNROLLING(int, i, 0, 1, 6,
+    {
+        src_indirect_y[i].v = mout + i *SRC_HEIGHT;
+        src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 6);
+    })
+
+    // Initialize the input tile
+    LOOP_UNROLLING(int, i, 0, 1, 6,
+    {
+        in[i].v = 0;
+    })
+
+    // Load the values across the 6 channels to compose the 6x1 or 1x6 tile
+    T_LOAD_INDIRECT(DATA_TYPE, 6, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+    // Compute out00, out01, out02 and out03
+    out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v;
+    out[1].v = in[1].v - in[2].v + 2.0f * in[3].v - 2.0f * in[4].v;
+    out[2].v = in[1].v + in[2].v + 4.0f * in[3].v + 4.0f * in[4].v;
+    out[3].v = in[1].v - in[2].v + 8.0f * in[3].v - 8.0f * in[4].v + in[5].v;
+
+#if defined(HAS_BIAS)
+    TILE(DATA_TYPE, 1, N0, b);
+
+    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+    // c = c + bias[broadcasted]
+    T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
+#endif // HAS_BIAS
+
+    int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
+    int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
+
+    T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+    TILE(uint, 4, 1, dst_indirect_y);
+
+    // Calculate the destination indirect Y
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    LOOP_UNROLLING(int, yk, 0, 1, 4,
+    {
+        int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
+        dst_indirect_y[yk].v = x_out + y_c *DST_WIDTH;
+        dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+    })
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    LOOP_UNROLLING(int, xk, 0, 1, 4,
+    {
+        int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
+        dst_indirect_y[xk].v = x_c + y_out *DST_WIDTH;
+        dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+    })
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+    // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
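+    // Editor's note (illustrative, not part of the upstream patch): the four "out"
+    // rows computed above are the F(4, 3) output transform out = A^T * d for the
+    // 6-point input d0..d5 loaded into "in":
+    //
+    //   out0 = d0 + d1 + d2 + d3 + d4
+    //   out1 = d1 - d2 + 2*d3 - 2*d4
+    //   out2 = d1 + d2 + 4*d3 + 4*d4
+    //   out3 = d1 - d2 + 8*d3 - 8*d4 + d5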
+ +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + + // Calculate the indirect Y for the source tensor + TILE(DATA_TYPE, 36, N0, in); + TILE(DATA_TYPE, 4, N0, tmp); + TILE(uint, 36, 1, src_indirect_y); + + LOOP_UNROLLING(int, i, 0, 1, 36, + { + src_indirect_y[i].v = mout + i *SRC_HEIGHT; + src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 36); + }) + + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 1, 36, + { + in[i].v = 0; + }) + + // Load the values across the 36 channels to compose the 6x6 or 6x1 tile + T_LOAD_INDIRECT(DATA_TYPE, 36, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); + + LOOP_UNROLLING(int, i, 0, 1, 6, + { + tmp[0].v = in[6 + i].v + in[12 + i].v; + tmp[1].v = in[6 + i].v - in[12 + i].v; + tmp[2].v = in[18 + i].v + in[24 + i].v; + tmp[3].v = in[18 + i].v - in[24 + i].v; + tmp[3].v = tmp[3].v + tmp[3].v; + in[i].v = in[i].v + tmp[0].v + tmp[2].v; + in[6 + i].v = tmp[3].v + tmp[1].v; + in[12 + i].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v); + in[18 + i].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[30 + i].v; + }) + + // Compute the output tile + TILE(DATA_TYPE, 16, N0, out); + + LOOP_UNROLLING(int, i, 0, 1, 4, + { + tmp[0].v = in[6 * i + 1].v + in[6 * i + 2].v; + tmp[1].v = in[6 * i + 1].v - in[6 * i + 2].v; + tmp[2].v = in[6 * i + 3].v + in[6 * i + 4].v; + tmp[3].v = in[6 * i + 3].v - in[6 * i + 4].v; + tmp[3].v = tmp[3].v + tmp[3].v; + out[4 * i + 0].v = in[6 * i + 0].v + tmp[0].v + tmp[2].v; + out[4 * i + 1].v = tmp[3].v + tmp[1].v; + out[4 * i + 2].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v); + out[4 * i + 3].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[6 * i + 5].v; + }) + +#if defined(HAS_BIAS) + TILE(DATA_TYPE, 1, N0, b); + + T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); + + // c = c + bias[broadcasted] + T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out); +#endif // HAS_BIAS + + int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W; + int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H; + + T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); + + TILE(uint, 16, 1, dst_indirect_y); + + // Calculate the destination indirect Y + LOOP_UNROLLING(int, yk, 0, 1, 4, + { + LOOP_UNROLLING(int, xk, 0, 1, 4, + { + int x_c = min(x_out + xk, ((int)DST_WIDTH - 1)); + int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1)); + dst_indirect_y[xk + yk * 4].v = x_c + y_c *DST_WIDTH; + dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT); + }) + }) + + // Store the tile in reverse order so the invalid values are overwritten with the valid ones + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) +} + +/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. 
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x4_5x5_nhwc(
+    TENSOR4D(src, BUFFER),
+    TENSOR4D(dst, BUFFER),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
+    const int mout = GET_SPATIAL_IDX(1, 1, 0);  // WINOGRAD OUTPUT TILES
+    const int bout = GET_SPATIAL_IDX(2, 1, 0);  // BATCH SIZE IDX
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    TILE(DATA_TYPE, 8, N0, in);
+    TILE(DATA_TYPE, 4, N0, out);
+    TILE(DATA_TYPE, 4, N0, tmp);
+    TILE(uint, 8, 1, src_indirect_y);
+
+    LOOP_UNROLLING(int, i, 0, 1, 8,
+    {
+        src_indirect_y[i].v = mout + i * SRC_HEIGHT;
+        src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 8);
+    })
+
+    // Initialize the input tile
+    LOOP_UNROLLING(int, i, 0, 1, 8,
+    {
+        in[i].v = 0;
+    })
+
+    // "in" contains 1x8 or 8x1 tile here
+    T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+    // A^T * in, and in this degenerate case out consists of 1 column/row
+    tmp[0].v = in[1].v - in[2].v;
+    tmp[1].v = 2.0f * (in[3].v - in[4].v);
+    tmp[2].v = 2.0f * (in[5].v + in[6].v);
+    tmp[3].v = in[3].v + in[4].v;
+    out[0].v = in[0].v + in[1].v + in[2].v + tmp[3].v + 4.0f * tmp[2].v;
+    out[1].v = tmp[0].v + tmp[1].v + 4.0f * (in[5].v - in[6].v);
+    out[2].v = in[1].v + in[2].v + 4.0f * tmp[3].v + tmp[2].v;
+    out[3].v = tmp[0].v + 4.0f * tmp[1].v + in[5].v - in[6].v + in[7].v;
+
+#if defined(HAS_BIAS)
+    TILE(DATA_TYPE, 1, N0, b);
+
+    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+    // c = c + bias[broadcasted]
+    T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
+#endif // HAS_BIAS
+
+    int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
+    int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
+
+    T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+    TILE(uint, 4, 1, dst_indirect_y);
+
+    // Calculate the destination indirect Y
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    LOOP_UNROLLING(int, yk, 0, 1, 4,
+    {
+        int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
+        dst_indirect_y[yk].v = x_out + y_c * DST_WIDTH;
+        dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+    })
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    LOOP_UNROLLING(int, xk, 0, 1, 4,
+    {
+        int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
+        dst_indirect_y[xk].v = x_c + y_out * DST_WIDTH;
+        dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+    })
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+    // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+    // Calculate the indirect Y for the source tensor
+    TILE(DATA_TYPE, 64, N0, in);
+    TILE(DATA_TYPE, 6, N0, tmp);
+    TILE(uint, 64, 1, src_indirect_y);
+
+    LOOP_UNROLLING(int, i, 0, 1, 64,
+    {
+        src_indirect_y[i].v = mout + i * SRC_HEIGHT;
+        src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 64);
+    })
+
+    // Initialize the input tile
+    LOOP_UNROLLING(int, i, 0, 1, 64,
+    {
+        in[i].v = 0;
+    })
+
+    // "in" here is 8x8 tile
+    T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+    // A^T * in
+    LOOP_UNROLLING(int, i, 0, 1, 8,
+    {
+        tmp[0].v = in[8 + i].v + in[16 + i].v;
+        tmp[1].v = in[8 + i].v - in[16 + i].v;
+        tmp[2].v = in[24 + i].v + in[32 + i].v;
+        tmp[3].v = in[24 + i].v - in[32 + i].v;
+        tmp[3].v = tmp[3].v + tmp[3].v;
+        tmp[4].v = in[40 + i].v + in[48 + i].v;
+        tmp[4].v = tmp[4].v + tmp[4].v;
+        tmp[5].v = in[40 + i].v - in[48 + i].v;
+
+        // 4x8 matrix as a result
+        in[i].v = in[i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);
+        in[8 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);
+        in[16 + i].v = tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[4].v);
+        in[24 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[5].v) + in[56 + i].v;
+    })
+
+    // Compute the output tile
+    TILE(DATA_TYPE, 16, N0, out);
+
+    // in * A, with in = A^T * in as above
+    LOOP_UNROLLING(int, i, 0, 1, 4,
+    {
+        tmp[0].v = in[8 * i + 1].v + in[8 * i + 2].v;
+        tmp[1].v = in[8 * i + 1].v - in[8 * i + 2].v;
+        tmp[2].v = in[8 * i + 3].v + in[8 * i + 4].v;
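+        // Editorial note (illustration, not part of the original patch): with m_j denoting
+        // in[8 * i + j], the outputs below are dot products of (m_0, ..., m_7) with the rows
+        // of the (scaled) 4x8 inverse-transform matrix
+        //       [ 1  1  1  1  1  8  8  0 ]
+        // A^T = [ 0  1 -1  2 -2  4 -4  0 ]
+        //       [ 0  1  1  4  4  2  2  0 ]
+        //       [ 0  1 -1  8 -8  1 -1  1 ]
+        // tmp[0..5] factor out the shared sums/differences, e.g. tmp[3] = 2 * (m_3 - m_4).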
+        tmp[3].v = in[8 * i + 3].v - in[8 * i + 4].v;
+        tmp[3].v = tmp[3].v + tmp[3].v;
+        tmp[4].v = in[8 * i + 5].v + in[8 * i + 6].v;
+        tmp[4].v = tmp[4].v + tmp[4].v;
+        tmp[5].v = in[8 * i + 5].v - in[8 * i + 6].v;
+
+        // 4x4 tile
+        out[4 * i].v = in[8 * i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);
+        out[4 * i + 1].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);
+        out[4 * i + 2].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[0].v) + tmp[4].v;
+        out[4 * i + 3].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[1].v) + tmp[5].v + in[8 * i + 7].v;
+    })
+
+#if defined(HAS_BIAS)
+    TILE(DATA_TYPE, 1, N0, b);
+
+    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+    // c = c + bias[broadcasted]
+    T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);
+#endif // HAS_BIAS
+
+    int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
+    int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
+
+    T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+    TILE(uint, 16, 1, dst_indirect_y);
+
+    // Calculate the destination indirect Y
+    LOOP_UNROLLING(int, yk, 0, 1, 4,
+    {
+        LOOP_UNROLLING(int, xk, 0, 1, 4,
+        {
+            int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
+            int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
+            dst_indirect_y[xk + yk * 4].v = x_c + y_c * DST_WIDTH;
+            dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+        })
+    })
+
+    // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_2x1_7x1_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    winograd_output_transform_2x2_7x7_nhwc(src_ptr,
+        src_stride_x,
+        src_step_x,
+        src_stride_y,
+        src_step_y,
+        src_stride_z,
+        src_step_z,
+        src_stride_w,
+        src_step_w,
+        src_offset_first_element_in_bytes,
+        dst_ptr,
+        dst_stride_x,
+        dst_step_x,
+        dst_stride_y,
+        dst_step_y,
+        dst_stride_z,
+        dst_step_z,
+        dst_stride_w,
+        dst_step_w,
+        dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+        bias_ptr,
+        bias_stride_x,
+        bias_step_x,
+        bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+        dst_size);
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+
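+// Editorial note (illustrative, not part of the original patch): a plausible set of build
+// options for the 4x1 horizontal variants below, assembled from their @note requirements
+// (the values are examples only):
+//   -DVEC_SIZE=4 -DNUM_TILES_X=16 -DOUTPUT_TILE_W=4 -DOUTPUT_TILE_H=1
+//   -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL -DDATA_TYPE=float -DN0=2
+//   -DSRC_HEIGHT=32 -DDST_WIDTH=24 -DDST_HEIGHT=32
+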
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x1_3x1_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    winograd_output_transform_4x4_3x3_nhwc(src_ptr,
+        src_stride_x,
+        src_step_x,
+        src_stride_y,
+        src_step_y,
+        src_stride_z,
+        src_step_z,
+        src_stride_w,
+        src_step_w,
+        src_offset_first_element_in_bytes,
+        dst_ptr,
+        dst_stride_x,
+        dst_step_x,
+        dst_stride_y,
+        dst_step_y,
+        dst_stride_z,
+        dst_step_z,
+        dst_stride_w,
+        dst_step_w,
+        dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+        bias_ptr,
+        bias_stride_x,
+        bias_step_x,
+        bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+        dst_size);
+}
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x1_5x1_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    winograd_output_transform_4x4_5x5_nhwc(src_ptr,
+        src_stride_x,
+        src_step_x,
+        src_stride_y,
+        src_step_y,
+        src_stride_z,
+        src_step_z,
+        src_stride_w,
+        src_step_w,
+        src_offset_first_element_in_bytes,
+        dst_ptr,
+        dst_stride_x,
+        dst_step_x,
+        dst_stride_y,
+        dst_step_y,
+        dst_stride_z,
+        dst_step_z,
+        dst_stride_w,
+        dst_step_w,
+        dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+        bias_ptr,
+        bias_stride_x,
+        bias_step_x,
+        bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+        dst_size);
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_1x2_1x7_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    winograd_output_transform_2x2_7x7_nhwc(src_ptr,
+        src_stride_x,
+        src_step_x,
+        src_stride_y,
+        src_step_y,
+        src_stride_z,
+        src_step_z,
+        src_stride_w,
+        src_step_w,
+        src_offset_first_element_in_bytes,
+        dst_ptr,
+        dst_stride_x,
+        dst_step_x,
+        dst_stride_y,
+        dst_step_y,
+        dst_stride_z,
+        dst_step_z,
+        dst_stride_w,
+        dst_step_w,
+        dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+        bias_ptr,
+        bias_stride_x,
+        bias_step_x,
+        bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+        dst_size);
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_1x4_1x3_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    winograd_output_transform_4x4_3x3_nhwc(src_ptr,
+        src_stride_x,
+        src_step_x,
+        src_stride_y,
+        src_step_y,
+        src_stride_z,
+        src_step_z,
+        src_stride_w,
+        src_step_w,
+        src_offset_first_element_in_bytes,
+        dst_ptr,
+        dst_stride_x,
+        dst_step_x,
+        dst_stride_y,
+        dst_step_y,
+        dst_stride_z,
+        dst_step_z,
+        dst_stride_w,
+        dst_step_w,
+        dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+        bias_ptr,
+        bias_stride_x,
+        bias_step_x,
+        bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+        dst_size);
+}
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_1x4_1x5_nhwc(
+    TENSOR4D_DECLARATION(src),
+    TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+    VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+    int dst_size)
+{
+    winograd_output_transform_4x4_5x5_nhwc(src_ptr,
+        src_stride_x,
+        src_step_x,
+        src_stride_y,
+        src_step_y,
+        src_stride_z,
+        src_step_z,
+        src_stride_w,
+        src_step_w,
+        src_offset_first_element_in_bytes,
+        dst_ptr,
+        dst_stride_x,
+        dst_step_x,
+        dst_stride_y,
+        dst_step_y,
+        dst_stride_z,
+        dst_step_z,
+        dst_stride_w,
+        dst_step_w,
+        dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+        bias_ptr,
+        bias_stride_x,
+        bias_step_x,
+        bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+        dst_size);
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nonmax.cl b/src/core/CL/cl_kernels/nonmax.cl
deleted file mode 100644
index ab13131807..0000000000
--- a/src/core/CL/cl_kernels/nonmax.cl
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2016-2020 Arm Limited.
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** This function performs Non maxima suppression over a 3x3 window on a given image. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8/F32 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. 
Supported data types: same as @p scr_ptr - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void non_max_suppression( - IMAGE_DECLARATION(src), - IMAGE_DECLARATION(dst)) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); - Image dst = CONVERT_TO_IMAGE_STRUCT(dst); - - VEC_DATA_TYPE(DATA_TYPE, 8) - vc = vload8(0, (__global DATA_TYPE *)src.ptr); - - if(all(vc == (DATA_TYPE)0)) - { - vstore8(0, 0, (__global DATA_TYPE *)dst.ptr); - - return; - } - - VEC_DATA_TYPE(DATA_TYPE, 16) - nc = vload16(0, (__global DATA_TYPE *)offset(&src, -1, -1)); - VEC_DATA_TYPE(DATA_TYPE, 8) - out = select((DATA_TYPE)0, vc, (vc >= nc.s01234567) && (vc >= nc.s12345678) && (vc >= nc.s23456789)); - - nc = vload16(0, (__global DATA_TYPE *)offset(&src, -1, 0)); - out = select((DATA_TYPE)0, out, (vc >= nc.s01234567) && (vc > nc.s23456789)); - - nc = vload16(0, (__global DATA_TYPE *)offset(&src, -1, +1)); - out = select((DATA_TYPE)0, out, (vc > nc.s01234567) && (vc > nc.s12345678) && (vc > nc.s23456789)); - - vstore8(out, 0, (__global DATA_TYPE *)dst.ptr); -} diff --git a/src/core/CL/cl_kernels/normalization_layer.cl b/src/core/CL/cl_kernels/normalization_layer.cl deleted file mode 100644 index 4569208824..0000000000 --- a/src/core/CL/cl_kernels/normalization_layer.cl +++ /dev/null @@ -1,319 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" -#include "tile_helpers.h" - -#define MUL_OP(x, y) ((x) * (y)) -#define ADD_OP(x, y) ((x) + (y)) -#define DIV_OP(x, y) ((x) / (y)) -#define POW_OP(x, y) pow((x), (y)) -#define SQCVT_SAT(a) (a) - -#if defined(NUM_SLICES) -/** Apply cross-map normalization. - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16 - * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. 
-DRADIUS=5 - * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192 - * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA - * - * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void normalization_layer_cross_map_nchw(TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); - - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - acc = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0; - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - coeff_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(COEFF); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - beta_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(BETA); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA); - - const int current_slice = get_global_id(2); - const int left_slice = max(-(int)RADIUS, -current_slice); - const int right_slice = min((int)RADIUS, (int)NUM_SLICES - 1 - current_slice); - - for(int i = left_slice; i <= right_slice; i++) - { - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, 0, i)); - acc = ADD_OP(acc, MUL_OP(values, values)); - } - - acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - normalized = POW_OP(acc, beta_v); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - normalized_pixel = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), normalized); - - VSTORE(VEC_SIZE) - (normalized_pixel, 0, (__global DATA_TYPE *)out.ptr); -} -#endif /* defined(NUM_SLICES) */ - -#if defined(WIDTH_SIZE) -/** Apply cross-map normalization. - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. 
-DDATA_TYPE=short - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16 - * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5 - * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192 - * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA - * - * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void normalization_layer_cross_map_nhwc(TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - // Offset computation - const uint x_offs = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER); - - // Address computation - __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z; - __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z; - - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - acc = 0; - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - coeff_v = SQCVT_SAT(COEFF); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - beta_v = SQCVT_SAT(BETA); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - kappa_v = SQCVT_SAT(KAPPA); - - const int left_slice = max((int)0, (int)x_offs - (int)RADIUS); - const int right_slice = min((int)WIDTH_SIZE - 1, (int)x_offs + (int)RADIUS); - - for(int i = left_slice; i <= right_slice; ++i) - { - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + i * sizeof(DATA_TYPE))); - acc = ADD_OP(acc, MUL_OP(values, values)); - } - - acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - normalized = POW_OP(acc, beta_v); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - normalized_pixel0 = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr 
+ x_offs * sizeof(DATA_TYPE))), normalized); - - STORE_VECTOR_SELECT(normalized_pixel, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); -} - -/** Apply in-map normalization when tensors are in the NCHW data layout format. - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16 - * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5 - * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA - * @note The leftover size in the X dimension shoud be given as preprocessor argument using -DVEC_SIZE_LEFTOVER is; x_dimension % VEC_SIZE. e.g. -DVEC_SIZE_LEFTOVER=1 - * - * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the first destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the first source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void normalization_layer_in_map_nchw(TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); - - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - acc = 0; - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - coeff_v = SQCVT_SAT(COEFF); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - beta_v = SQCVT_SAT(BETA); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - kappa_v = SQCVT_SAT(KAPPA); - - const int current_col = get_global_id(0) << 2; - const int left_pos = max(-(int)RADIUS, -3 - current_col); - const int right_pos = min((int)RADIUS, (int)WIDTH_SIZE - 1 - current_col); - -#if defined(IN_MAP_2D) - const int current_row = get_global_id(1); - const int first_row = max(-(int)RADIUS, -current_row); - const int last_row = min((int)RADIUS, (int)get_global_size(1) - 1 - current_row); -#endif /* defined(IN_MAP_2D) */ - -#if defined(IN_MAP_2D) - for(int j = first_row; j <= last_row; ++j) - { -#endif /* defined(IN_MAP_2D) */ - for(int i = left_pos; 
i <= right_pos; ++i) - { -#if defined(IN_MAP_2D) - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, j, 0)); -#else /* defined(IN_MAP_2D) */ - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, 0, 0)); -#endif /* defined(IN_MAP_2D) */ - acc = ADD_OP(acc, MUL_OP(values, values)); - } -#if defined(IN_MAP_2D) - } -#endif /* defined(IN_MAP_2D) */ - - acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - normalized = POW_OP(acc, beta_v); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - normalized_pixel = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), normalized); - - VSTORE(VEC_SIZE) - (normalized_pixel, 0, (__global DATA_TYPE *)out.ptr); -} -#endif // defined(WIDTH_SIZE) - -#if defined(NUM_SLICES) && defined(DIM1_SIZE) -/** Apply in-map normalization when tensors are in the NHWC data layout format. - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16 - * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5 - * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192 - * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA - * - * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the first destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the first source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void normalization_layer_in_map_nhwc(TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - // Offset computation - const uint x_offs = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER); - const int current_cols = get_global_id(1); - const int current_rows = get_global_id(2); - - // Address computation - __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE); - __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + current_cols * output_stride_y + current_rows * output_stride_z; - - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - acc = 0; - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - coeff_v = SQCVT_SAT(COEFF); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - beta_v = SQCVT_SAT(BETA); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - kappa_v = SQCVT_SAT(KAPPA); - - const int first_col = max(0, current_cols - (int)RADIUS); - const int last_col = min((int)DIM1_SIZE - 1, current_cols + (int)RADIUS); - -#if defined(IN_MAP_2D) - const int first_row = max(0, current_rows - (int)RADIUS); - const int last_row = min((int)NUM_SLICES - 1, current_rows + (int)RADIUS); -#endif /* defined(IN_MAP_2D) */ - -#if defined(IN_MAP_2D) - for(int j = first_row; j <= last_row; ++j) - { -#else // defined(IN_MAP_2D) - const int j = current_rows; -#endif /* defined(IN_MAP_2D) */ - for(int i = first_col; i <= last_col; ++i) - { - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + i * input_stride_y + j * input_stride_z)); - acc = ADD_OP(acc, MUL_OP(values, values)); - } -#if defined(IN_MAP_2D) - } -#endif /* defined(IN_MAP_2D) */ - - acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - normalized = POW_OP(acc, beta_v); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - normalized_pixel0 = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + current_cols * output_stride_y + current_rows * output_stride_z)), normalized); - - STORE_VECTOR_SELECT(normalized_pixel, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); -} -#endif // defined(NUM_SLICES) && defined(DIM1_SIZE) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl b/src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl deleted file mode 100644 index 0a098356b4..0000000000 --- a/src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(VEC_SIZE) - -#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - -/** Apply normalize_planar_yuv layer on tensors with NCHW data layout. - * - * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8 - * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8 - * - * @param[in] src_ptr Pointer to the first source tensor. Supported data types: F16/F32 - * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes) - * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes) - * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] mean_ptr Pointer to the mean source tensor. 
Supported data types: same as @p src_ptr - * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) - * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor - * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr - * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes) - * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] std_offset_first_element_in_bytes The offset of the first element in the var source tensor - */ -__kernel void normalize_planar_yuv_layer_nchw(TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - VECTOR_DECLARATION(mean), - VECTOR_DECLARATION(std)) -{ - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - Vector mean = CONVERT_TO_VECTOR_STRUCT(mean); - Vector std = CONVERT_TO_VECTOR_STRUCT(std); - - const uint current_slice = get_global_id(2) % NUM_CHANNELS; - - const DATA_TYPE curr_mean = *((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE))); - const DATA_TYPE curr_std = *((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE))); - - TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr); - TYPE res = (data - curr_mean) / curr_std; - - VSTORE(VEC_SIZE) - (res, 0, (__global DATA_TYPE *)dst.ptr); -} - -/** Apply normalize_planar_yuv layer on tensors with NHWC data layout. - * - * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8 - * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE - * - * @param[in] src_ptr Pointer to the first source tensor. Supported data types: F16/F32 - * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes) - * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes) - * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr - * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) - * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor - * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr - * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes) - * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] std_offset_first_element_in_bytes The offset of the first element in the var source tensor - */ -__kernel void normalize_planar_yuv_layer_nhwc(TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - VECTOR_DECLARATION(mean), - VECTOR_DECLARATION(std)) -{ - uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0); - - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z; - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z; - __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs; - __global uchar *std_addr = std_ptr + std_offset_first_element_in_bytes + x_offs; - - const TYPE curr_mean = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr); - const TYPE curr_std = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)std_addr); - - TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr); - TYPE res0 = (data - curr_mean) / curr_std; - - STORE_VECTOR_SELECT(res, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); -} -#endif // defined(DATA_TYPE) && defined(VEC_SIZE) diff --git a/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl b/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl deleted file mode 100644 index d660fffb58..0000000000 --- a/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl +++ /dev/null @@ -1,166 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE) - -#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) -#define OFFSET_FLT ((float)OFFSET) -#define SCALE_FLT ((float)SCALE) - -#if defined(NUM_CHANNELS) - -/** Apply normalize_planar_yuv layer on tensors with NCHW data layout. - * - * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8 - * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8 - * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8 - * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8 - * - * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED - * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes) - * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes) - * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr - * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) - * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor - * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr - * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes) - * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] std_offset_first_element_in_bytes The offset of the first element in the std tensor - */ -__kernel void normalize_planar_yuv_layer_q8_nchw(TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - VECTOR_DECLARATION(mean), - VECTOR_DECLARATION(std)) -{ - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - Vector mean = CONVERT_TO_VECTOR_STRUCT(mean); - Vector std = CONVERT_TO_VECTOR_STRUCT(std); - - const uint current_slice = get_global_id(2) % NUM_CHANNELS; - - VEC_DATA_TYPE(float, VEC_SIZE) - curr_mean_flt = (VEC_DATA_TYPE(float, VEC_SIZE))(*((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE)))); - curr_mean_flt = round(curr_mean_flt - OFFSET_FLT) * SCALE_FLT; - - VEC_DATA_TYPE(float, VEC_SIZE) - curr_std_flt = (VEC_DATA_TYPE(float, VEC_SIZE))(*((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE)))); - curr_std_flt = round(curr_std_flt - OFFSET_FLT) * SCALE_FLT; - - VEC_DATA_TYPE(float, VEC_SIZE) - data_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr), VEC_DATA_TYPE(float, VEC_SIZE)); - data_flt = round(data_flt - OFFSET_FLT) * SCALE_FLT; - - // Perform normalization - VEC_DATA_TYPE(float, VEC_SIZE) - res_flt = (data_flt - curr_mean_flt) / curr_std_flt; - - const TYPE res_u8 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE); - VSTORE(VEC_SIZE) - (res_u8, 0, (__global DATA_TYPE *)dst.ptr); -} - -#endif // defined(NUM_CHANNELS) - -/** Apply normalize_planar_yuv layer on tensors with NHWC data layout. - * - * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8 - * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8 - * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8 - * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3.
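The quantized variants in this file wrap that same normalization in a dequantize/requantize pair: a quantized value q is taken to real space as round(q - OFFSET) * SCALE, normalized there, and mapped back with the inverse transform under saturation. A scalar sketch of the q8 path, assuming QASYMM8 (uint8) values (hypothetical reference code, not part of the library):

    #include <math.h>
    #include <stdint.h>

    /* Dequantize, normalize, requantize with saturation, as the q8 kernels do. */
    static uint8_t normalize_planar_yuv_q8(uint8_t q_data, uint8_t q_mean,
                                           uint8_t q_std, float offset, float scale)
    {
        const float data    = roundf((float)q_data - offset) * scale; /* dequantize */
        const float mean    = roundf((float)q_mean - offset) * scale;
        const float std_dev = roundf((float)q_std - offset) * scale;
        const float res     = (data - mean) / std_dev;                /* normalize  */
        const float q       = roundf(res / scale) + offset;           /* requantize */
        return (uint8_t)fminf(fmaxf(q, 0.0f), 255.0f);                /* saturate   */
    }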
It is defined as the remainder between the input's first dimension and VEC_SIZE - * - * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED - * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes) - * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes) - * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr - * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) - * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor - * @param[in] std_ptr Pointer to the std tensor. 
Supported data types: same as @p src_ptr - * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes) - * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] std_offset_first_element_in_bytes The offset of the first element in the std tensor - */ -__kernel void normalize_planar_yuv_layer_q8_nhwc(TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - VECTOR_DECLARATION(mean), - VECTOR_DECLARATION(std)) -{ - uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0); - - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z; - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z; - __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs; - __global uchar *std_addr = std_ptr + std_offset_first_element_in_bytes + x_offs; - - VEC_DATA_TYPE(float, VEC_SIZE) - curr_mean_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr), VEC_DATA_TYPE(float, VEC_SIZE)); - curr_mean_flt = round(curr_mean_flt - OFFSET_FLT) * SCALE_FLT; - - VEC_DATA_TYPE(float, VEC_SIZE) - curr_std_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)std_addr), VEC_DATA_TYPE(float, VEC_SIZE)); - curr_std_flt = round(curr_std_flt - OFFSET_FLT) * SCALE_FLT; - - VEC_DATA_TYPE(float, VEC_SIZE) - data_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr), VEC_DATA_TYPE(float, VEC_SIZE)); - data_flt = round(data_flt - OFFSET_FLT) * (SCALE_FLT); - - // Perform normalization - VEC_DATA_TYPE(float, VEC_SIZE) - res_flt = (data_flt - curr_mean_flt) / curr_std_flt; - - const TYPE res0 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE); - STORE_VECTOR_SELECT(res, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); -} -#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE) diff --git a/src/core/CL/cl_kernels/pad_layer.cl b/src/core/CL/cl_kernels/pad_layer.cl deleted file mode 100644 index 903e924a2f..0000000000 --- a/src/core/CL/cl_kernels/pad_layer.cl +++ /dev/null @@ -1,263 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
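Both NHWC kernels also rely on the leftover-vector idiom used throughout these files: when the first dimension is not a multiple of VEC_SIZE, every work-item is shifted back by (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE elements (clamped to 0 for the first one), and STORE_VECTOR_SELECT writes only VEC_SIZE_LEFTOVER lanes for work-item 0, so consecutive vectors overlap instead of running past the row. A scalar sketch of the offset computation, mirroring the kernel names:

    /* Starting element for work-item gid when the row length is not a
     * multiple of vec_size. Work-item 0 covers only the leftover lanes;
     * later work-items are shifted back so the grid covers the row exactly. */
    int first_element(int gid, int vec_size, int leftover)
    {
        int x = gid * vec_size - (vec_size - leftover) % vec_size;
        return x < 0 ? 0 : x; /* the kernels clamp with max(..., 0) */
    }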
- */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && defined(SRC_WIDTH) && defined(PAD_X_BEFORE_REMAINDER) && defined(VEC_SIZE_LEFTOVER_WRITE) - -#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) -#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) -#define VEC_SELECT SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) -#define OFFSETS VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VEC_SIZE) -#define SCALAR_COND(x) CONVERT((VEC_SELECT)x == (VEC_SELECT)1, VEC_SELECT) - -#if defined(CONST_VAL) && defined(VEC_SIZE_LEFTOVER_READ) -/** Perform a pad operation when PaddingMode is CONSTANT - * - * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @note Vector size must be passed using the -DVEC_SIZE compile flag, e.g. -DVEC_SIZE=4 - * @note Constant value used to fill the pads must be passed using the -DCONST_VAL compile flag, e.g. -DCONST_VAL=1.27 - * @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g. -DPAD_X_BEFORE=5 - * @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g. -DSRC_WIDTH=224 - * @note In case pad left is more than the vector size, the number of threads to skip along the X axis must be passed using the - * -DTHREADS_TO_SKIP_BEFORE compile flag, e.g. -DTHREADS_TO_SKIP_BEFORE=1. This is defined as (PAD_X_BEFORE / VEC_SIZE) - * @note In case pad left is more than the vector size, the thread from which to skip along the X axis for pad right must be passed using the - * -DTHREADS_TO_SKIP_AFTER compile flag, e.g. -DTHREADS_TO_SKIP_AFTER=1. This is defined as ((SRC_WIDTH + PAD_X_BEFORE) / VEC_SIZE) - * @note If pad also needs to be added to the top of the tensor, the following compile flags must be passed at compile time: - * -# -DPAD_Y_BEFORE: Pad to add to the top of the input tensor (e.g. -DPAD_Y_BEFORE=3) - * -# -DSRC_HEIGHT: Input tensor's height (e.g. -DSRC_HEIGHT=127) - * @note If pad also needs to be added to the depth of the tensor, the following compile flags must be passed at compile time: - * -# -DPAD_Z_BEFORE: Pad to add before the first plane of the input tensor (e.g. -DPAD_Z_BEFORE=3) - * -# -DSRC_DEPTH: Input tensor's depth (e.g. -DSRC_DEPTH=32) - * @note If pad also needs to be added to the batch of the tensor, the following compile flags must be passed at compile time: - * -# -DPAD_W_BEFORE: Pad to add before the first batch of the input tensor (e.g. -DPAD_W_BEFORE=3) - * -# -DSRC_BATCH: Input tensor's batch size (e.g. -DSRC_BATCH=4) - * - * @param[in] src_ptr Pointer to the source image. Supported data types: All - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source image in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image.
Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination image in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] batch (Optional) Batch index if 4D pad must be applied - */ -__kernel void pad_layer_constant(TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst) -#if defined(PAD_W_BEFORE) - , - uint batch -#endif // defined(PAD_W_BEFORE) - ) -{ - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - - int x = get_global_id(0); - int y = get_global_id(1); - int z = get_global_id(2); - - // If true, write only padding values; no reads performed - uint cond = 0; -#if defined(THREADS_TO_SKIP_BEFORE) - cond |= x < THREADS_TO_SKIP_BEFORE || x > THREADS_TO_SKIP_AFTER; -#endif // defined(THREADS_TO_SKIP_BEFORE) -#if defined(PAD_Y_BEFORE) - cond |= y < PAD_Y_BEFORE || y >= (SRC_HEIGHT + PAD_Y_BEFORE); -#endif // defined(PAD_Y_BEFORE) -#if defined(PAD_Z_BEFORE) - cond |= z < PAD_Z_BEFORE || z >= (SRC_DEPTH + PAD_Z_BEFORE); -#endif // defined(PAD_Z_BEFORE) -#if defined(PAD_W_BEFORE) - cond |= batch < PAD_W_BEFORE || batch >= (SRC_BATCH + PAD_W_BEFORE); -#endif // defined(PAD_W_BEFORE) - - if(cond) - { - VEC_TYPE const_vals0 = (VEC_TYPE)CONST_VAL; - STORE_VECTOR_SELECT(const_vals, DATA_TYPE, dst.ptr, VEC_SIZE, VEC_SIZE_LEFTOVER_WRITE, get_global_id(0) == (get_global_size(0) - 1)); - } - else - { - // Calculate input's coordinates based on output's - int w = 0; -#if defined(THREADS_TO_SKIP_BEFORE) - x -= THREADS_TO_SKIP_BEFORE; -#endif // defined(THREADS_TO_SKIP_BEFORE) -#if defined(PAD_Y_BEFORE) - y -= PAD_Y_BEFORE; -#endif // defined(PAD_Y_BEFORE) -#if defined(PAD_Z_BEFORE) - z -= PAD_Z_BEFORE; -#endif // defined(PAD_Z_BEFORE) -#if defined(PAD_W_BEFORE) - w -= PAD_W_BEFORE * SRC_DEPTH; -#endif // defined(PAD_W_BEFORE) - x *= VEC_SIZE; - x -= PAD_X_BEFORE_REMAINDER; - - // Check for out of bound reads and clamp X coordinate - uint cond_left = x < 0; - uint cond_right = (x + VEC_SIZE) > SRC_WIDTH; - x = clamp(x, 0, (SRC_WIDTH - VEC_SIZE)); - - // Calculate input's address - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * src_stride_x + y * src_stride_y + z * src_stride_z + w * (int)src_stride_z; - - // Read values and rotate them properly if they would have been across paddings - VEC_TYPE src_vals0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr); - src_vals0 = select(src_vals0, ROTATE(src_vals0, VEC_SIZE, PAD_X_BEFORE_REMAINDER), SCALAR_COND(cond_left)); - src_vals0 = select(src_vals0, ROTATE(src_vals0, VEC_SIZE, VEC_SIZE_LEFTOVER_READ), SCALAR_COND(cond_right)); - - // Check what values would be padding and replace them with the constant value - VEC_INT xs_out = (VEC_INT)(get_global_id(0) * VEC_SIZE) + VEC_OFFS(int, VEC_SIZE); - VEC_INT conds = xs_out < (VEC_INT)PAD_X_BEFORE || xs_out >= (VEC_INT)(SRC_WIDTH + PAD_X_BEFORE); - src_vals0 = select(src_vals0, (VEC_TYPE)CONST_VAL, CONVERT(conds, VEC_SELECT)); - - // Store values in bounds - STORE_VECTOR_SELECT(src_vals, 
DATA_TYPE, dst.ptr, VEC_SIZE, VEC_SIZE_LEFTOVER_WRITE, get_global_id(0) == (get_global_size(0) - 1)); - } -} -#endif // defined(CONST_VAL) && defined(VEC_SIZE_LEFTOVER_READ) - -#if defined(IS_REFLECT) && defined(PAD_X_AFTER_REMAINDER) && defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) && defined(AFTER_PAD_FACT_X) - -#define ROTATE_REVERSE(x, n) ROTATE(REVERSE(x, VEC_SIZE), VEC_SIZE, n) -#define SYMM_REFL_LEFT(x, n0, n1) select(ROTATE_REVERSE(x, n1), ROTATE(x, VEC_SIZE, n0), OFFSETS >= (VEC_SELECT)n0) -#define SYMM_REFL_RIGHT(x, n0, n1) select(ROTATE(x, VEC_SIZE, n0), ROTATE_REVERSE(x, n1), OFFSETS >= (VEC_SELECT)n0) - -/** Perform a pad operation when PaddingMode is SYMMETRIC - * - * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @note Vector size must be passed using the -DVEC_SIZE compile flag, e.g. -DVEC_SIZE=4 - * @note Constant value must be passed using the -DCONST_VAL compile flag, e.g. -DCONST_VAL=1.27 - * @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g. -DPAD_X_BEFORE=5 - * @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g. -DSRC_WIDTH=224 - * @note Number of values to the left when operating across left padding must be passed using the -DPAD_X_BEFORE_REMAINDER compile flag, e.g. -DPAD_X_BEFORE_REMAINDER=5 - * @note Number of values to the left when operating across right padding must be passed using the -DPAD_X_AFTER_REMAINDER compile flag, e.g. -DPAD_X_AFTER_REMAINDER=6 - * @note To rearrange the vectors properly, (PAD_X_BEFORE_REMAINDER + 1) must be passed when mode is REFLECT using the -DPAD_X_BEFORE_REMAINDER_REFL compile flag, e.g. -DPAD_X_BEFORE_REMAINDER_REFL=6 - * @note To rearrange the vectors properly, (PAD_X_AFTER_REMAINDER - 1) must be passed using the -DPAD_X_AFTER_REMAINDER_REFL compile flag, e.g. -DPAD_X_AFTER_REMAINDER_REFL=5 - * @note The starting point to read backward from when past the pad in X must be passed using the -DAFTER_PAD_FACT_X compile flag, e.g. -DAFTER_PAD_FACT_X=253 - * @note If padding mode is REFLECT, the -DIS_REFLECT compile flag must be set to 1, else it must be set to 0 - * @note If pad also needs to be added to the top of the tensor, the following compile flags must be passed at compile time: - * -# -DPAD_Y_BEFORE: Pad to add to the top of the input tensor (e.g. -DPAD_Y_BEFORE=3) - * -# -DSRC_HEIGHT: Input tensor's height (e.g. -DSRC_HEIGHT=127) - * @note If pad also needs to be added to the depth of the tensor, the following compile flags must be passed at compile time: - * -# -DPAD_Z_BEFORE: Pad to add before the first plane of the input tensor (e.g. -DPAD_Z_BEFORE=3) - * -# -DSRC_DEPTH: Input tensor's depth (e.g. -DSRC_DEPTH=32) - * @note If the starting point to read backward from is less than the output's last element accessed in the X, the following compile flags must be passed at compile time to avoid negative offsets: - * -# -DAFTER_PAD_REM: Defines how much to rotate the vector if the backward calculation attempted to read from a negative offset (e.g. -DAFTER_PAD_REM=3) - * - * @param[in] src_ptr Pointer to the source image.
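In plain terms, pad_layer_constant above writes CONST_VAL wherever the work-item lies entirely in the border; otherwise it shifts the output coordinates back by the before-pads, clamps the X read into bounds, rotates the loaded vector if it straddled a border, and patches the remaining border lanes with the constant. Reduced to one dimension and scalar form (a simplified sketch, not the vectorized logic):

    /* 1-D scalar sketch of constant-mode padding. */
    float pad_constant_at(const float *src, int src_width, int pad_before,
                          float const_val, int x_out)
    {
        int x_in = x_out - pad_before;
        return (x_in < 0 || x_in >= src_width) ? const_val : src[x_in];
    }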
Supported data types: All - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source image in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination image in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void pad_layer_symmetric_reflect(TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - // Get current thread position - const int x = get_global_id(0); - const int y = get_global_id(1); - const int z = get_global_id(2); - - // Define conditions based on the thread X position w.r.t. pad left and right - const int x_out_first = x * VEC_SIZE; - const int x_out_last = x_out_first + VEC_SIZE; - const int is_before_pad_left = (x_out_last <= PAD_X_BEFORE); - const int is_across_pad_left = (x_out_first < PAD_X_BEFORE) && (x_out_last > PAD_X_BEFORE); - const int is_inside_input = (x_out_first >= PAD_X_BEFORE) && (x_out_last <= (SRC_WIDTH + PAD_X_BEFORE)); - const int is_across_pad_right = (x_out_first < (SRC_WIDTH + PAD_X_BEFORE)) && (x_out_last > (SRC_WIDTH + PAD_X_BEFORE)); - const int is_after_pad_right = (x_out_first >= (SRC_WIDTH + PAD_X_BEFORE)); - - // Calculate base pointers - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes; - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - - // Calculate input tensor's offset based on the defined conditions - int x_offset = 0; - x_offset = select(x_offset, PAD_X_BEFORE - x_out_last + IS_REFLECT, is_before_pad_left); - x_offset = select(x_offset, x_out_first - PAD_X_BEFORE, is_inside_input); - x_offset = select(x_offset, SRC_WIDTH - VEC_SIZE, is_across_pad_right); - x_offset = select(x_offset, AFTER_PAD_FACT_X - x_out_last, is_after_pad_right); - -#if defined(AFTER_PAD_REM) - int neg_offs = x_offset < 0; - x_offset = max(x_offset, 0); -#endif // defined(AFTER_PAD_REM) - - // Load input values from the computed offset - int y_in = y; - int z_in = z; -#if defined(PAD_Y_BEFORE) - y_in = select(y - PAD_Y_BEFORE, PAD_Y_BEFORE - y + IS_REFLECT - 1, y < PAD_Y_BEFORE); - y_in = select(y_in, 2 * SRC_HEIGHT + PAD_Y_BEFORE - y - IS_REFLECT - 1, y >= (SRC_HEIGHT + PAD_Y_BEFORE)); -#endif // defined(PAD_Y_BEFORE) -#if defined(PAD_Z_BEFORE) - z_in = select(z - PAD_Z_BEFORE, PAD_Z_BEFORE - z + IS_REFLECT - 1, z < PAD_Z_BEFORE); - z_in = select(z_in, 2 * SRC_DEPTH + PAD_Z_BEFORE - z - IS_REFLECT - 1, z >= (SRC_DEPTH 
+ PAD_Z_BEFORE)); -#endif // defined(PAD_Z_BEFORE) - - src_addr += x_offset * src_stride_x + y_in * src_step_y + z_in * src_step_z; - -#if SRC_WIDTH == 1 - VSTORE(VEC_SIZE) - ((VEC_TYPE)(*(__global DATA_TYPE *)src_addr), 0, (__global DATA_TYPE *)dst.ptr); -#else // SRC_WIDTH == 1 - - VEC_TYPE src_vals0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr); - - // Choose rearrangement policy based on the defined conditions - src_vals0 = select(src_vals0, SYMM_REFL_LEFT(src_vals0, PAD_X_BEFORE_REMAINDER, PAD_X_BEFORE_REMAINDER_REFL), SCALAR_COND(is_across_pad_left)); - src_vals0 = select(src_vals0, SYMM_REFL_RIGHT(src_vals0, PAD_X_AFTER_REMAINDER, PAD_X_AFTER_REMAINDER_REFL), SCALAR_COND(is_across_pad_right)); - src_vals0 = select(src_vals0, REVERSE(src_vals0, VEC_SIZE), SCALAR_COND((is_before_pad_left || is_after_pad_right))); -#if defined(AFTER_PAD_REM) - src_vals0 = select(src_vals0, ROTATE(src_vals0, VEC_SIZE, AFTER_PAD_REM), SCALAR_COND(neg_offs)); -#endif // defined(AFTER_PAD_REM) - - // Store values in bounds - STORE_VECTOR_SELECT(src_vals, DATA_TYPE, dst.ptr, VEC_SIZE, VEC_SIZE_LEFTOVER_WRITE, get_global_id(0) == (get_global_size(0) - 1)); -#endif // SRC_WIDTH == 1 -} -#endif // defined(IS_REFLECT) && defined(PAD_X_AFTER_REMAINDER) && defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) && defined(AFTER_PAD_FACT_X) -#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && defined(SRC_WIDTH) && defined(PAD_X_BEFORE_REMAINDER) && defined(VEC_SIZE_LEFTOVER_WRITE) diff --git a/src/core/CL/cl_kernels/permute.cl b/src/core/CL/cl_kernels/permute.cl deleted file mode 100644 index db9e7ecc25..0000000000 --- a/src/core/CL/cl_kernels/permute.cl +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4) -/** Perform a permute operation on an input tensor of shape DCHW. - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16 - * @attention Permutation vector is passed as a preprocessor argument using -DP1, -DP2, -DP3 and -DP4=int, e.g. -DP1=2, -DP2=1, -DP3=0 and -DP4=3. - * - * @param[in] input_ptr Pointer to the source image.
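The y_in/z_in select() pairs in pad_layer_symmetric_reflect above implement the usual mirror rule: SYMMETRIC repeats the border element while REFLECT skips it, which is where the +/- IS_REFLECT terms come from. The same index math as a scalar C sketch:

    /* Mirrored source index for output coordinate `out`, with `pad` elements
     * before a dimension of size n; is_reflect = 1 for REFLECT, 0 for SYMMETRIC. */
    int mirror_index(int out, int pad, int n, int is_reflect)
    {
        int in = out - pad;
        if (in < 0)
            in = pad - out + is_reflect - 1;         /* mirror across the left edge  */
        else if (in >= n)
            in = 2 * n + pad - out - is_reflect - 1; /* mirror across the right edge */
        return in;
    }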
Supported data types: All - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void permute(TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output)) - -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); - - int out_index[4] = { 0 }; - int in_index[4] = { 0 }; - - in_index[0] = get_global_id(0); // W - in_index[1] = get_global_id(1); // H - in_index[2] = get_global_id(2) % DEPTH_IN; // C - in_index[3] = get_global_id(2) / DEPTH_IN; // B - - out_index[0] = in_index[P1]; - out_index[1] = in_index[P2]; - out_index[2] = in_index[P3]; - out_index[3] = in_index[P4]; - - *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2], out_index[3])) = *((__global DATA_TYPE *)in.ptr); -} -#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4) diff --git a/src/core/CL/cl_kernels/pixelwise_mul_float.cl b/src/core/CL/cl_kernels/pixelwise_mul_float.cl deleted file mode 100644 index 10875293a9..0000000000 --- a/src/core/CL/cl_kernels/pixelwise_mul_float.cl +++ /dev/null @@ -1,179 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software.
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#ifdef SATURATE -#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##_sat##round(x)) -#else /* SATURATE */ -#define CONVERT_OP_FLOAT_STR(x, type, round) (convert_##type##round(x)) -#endif /* SATURATE */ -#define CONVERT_OP_FLOAT(x, type, round) CONVERT_OP_FLOAT_STR(x, type, round) - -#if defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT) - -#if defined(ACTIVATION_TYPE) -#include "activation_float_helpers.h" -#endif // defined(ACTIVATION_TYPE) - -#define VEC_ACC_TYPE VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE_OUT) -#define VEC_OUT_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT) -#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE_OUT) - -/** Performs a pixelwise multiplication with float scale of either integer or float inputs. - * - * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short - * @attention The data type of the intermediate result of the multiplication should be passed as well using -DACC_DATA_TYPE. - * e.g. If one of the inputs is S16 -DACC_DATA_TYPE=int should be passed else -DACC_DATA_TYPE=short. - * @attention -DDATA_TYPE_FLOAT must be passed if floating point inputs are provided. - * - * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32 - * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes) - * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16, F16, F32 - * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes) - * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image.
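The CONVERT_OP_FLOAT macros at the top of this file only toggle between OpenCL's built-in saturating and wrapping conversions. Expanded by hand for an arbitrary example instantiation (short4 with round-to-nearest-even), the two variants are simply these OpenCL C helpers:

    /* What CONVERT_OP_FLOAT(x, short4, _rte) selects with and without -DSATURATE. */
    short4 convert_with_saturate(float4 x)    { return convert_short4_sat_rte(x); }
    short4 convert_without_saturate(float4 x) { return convert_short4_rte(x); }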
Supported data types: U8, S16, F16, F32 - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes) - * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] scale Float scaling factor. Supported data types: F32 - */ -__kernel void pixelwise_mul_float( - TENSOR3D_DECLARATION(in1), - TENSOR3D_DECLARATION(in2), -#if !defined(IN_PLACE) - TENSOR3D_DECLARATION(out), -#endif // !defined(IN_PLACE) - const float scale) -{ - // Get pixels pointer - size_t x = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0); - size_t y = get_global_id(1); - size_t z = get_global_id(2); - - __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z; - __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z; - __global uchar * -#if !defined(IN_PLACE) - out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z; -#else // !defined(IN_PLACE) -#if defined(SRC1_IN_PLACE) - out_addr = in1_addr; -#else //defined(SRC1_IN_PLACE) - out_addr = in2_addr; -#endif //defined(SRC1_IN_PLACE) -#endif // !defined(IN_PLACE) - - // Load data - VEC_ACC_TYPE in1_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN1, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_IN1 *)in1_addr)), VEC_ACC_TYPE); - VEC_ACC_TYPE in2_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN2, VEC_SIZE_OUT))(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_IN2 *)in2_addr)), VEC_ACC_TYPE); - - // Perform multiplication -#ifdef DATA_TYPE_FLOAT - VEC_OUT_TYPE res0 = CONVERT(in1_data * in2_data * (ACC_DATA_TYPE)scale, VEC_OUT_TYPE); -#else /* DATA_TYPE_FLOAT */ - VEC_OUT_TYPE res0 = CONVERT_OP_FLOAT(CONVERT_OP_FLOAT((CONVERT(in1_data * in2_data, VEC_FLOAT) * scale), VEC_ACC_TYPE, ROUND), VEC_OUT_TYPE, ROUND); -#endif /* DATA_TYPE_FLOAT */ - -#if defined(ACTIVATION_TYPE) - res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE_OUT, VEC_SIZE_OUT, res0, A_VAL, B_VAL); -#endif // defined(ACTIVATION_TYPE) - - STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); -} -#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT) */ - -#if defined(DATA_TYPE) - -/** Performs a pixelwise multiplication of complex float values - * - * @param[in] in1_ptr Pointer to the source image.
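For integer operands, pixelwise_mul_float above widens the inputs to ACC_DATA_TYPE, multiplies in float, applies the scale, and converts back down twice (to the accumulator type, then to the output type), each time with the configured rounding and optional saturation. The net effect for U8 operands, as a scalar sketch assuming -DSATURATE and round-to-nearest-even:

    #include <math.h>
    #include <stdint.h>

    /* Scalar sketch of the integer path of pixelwise_mul_float for U8. */
    static uint8_t mul_float_scale_u8(uint8_t a, uint8_t b, float scale)
    {
        float res = rintf((float)a * (float)b * scale); /* widen, scale, round */
        if (res > 255.0f) res = 255.0f;                 /* -DSATURATE behaviour */
        if (res < 0.0f)   res = 0.0f;
        return (uint8_t)res;
    }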
Supported data types: F16/F32 - * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes) - * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] in2_ptr Pointer to the source image. Supported data types: same as @p in1_ptr - * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes) - * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in1_ptr - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes) - * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void pixelwise_mul_complex( - TENSOR3D_DECLARATION(in1), - TENSOR3D_DECLARATION(in2), - TENSOR3D_DECLARATION(out)) -{ - // Get pixels pointer - Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1); - Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out); - - // Load data - VEC_DATA_TYPE(DATA_TYPE, 2) - vin1 = vload2(0, (__global DATA_TYPE *)in1.ptr); - VEC_DATA_TYPE(DATA_TYPE, 2) - vin2 = vload2(0, (__global DATA_TYPE *)in2.ptr); - - // Perform complex multiplication - VEC_DATA_TYPE(DATA_TYPE, 2) - res = { vin1.x *vin2.x - vin1.y * vin2.y, vin1.x *vin2.y + vin2.x * vin1.y }; - -#if defined(ACTIVATION_TYPE) - vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE_OUT, res, A_VAL, B_VAL), 0, (__global DATA_TYPE *)out.ptr); -#else // defined(ACTIVATION_TYPE) - // Store result - vstore2(res, 0, (__global DATA_TYPE *)out.ptr); -#endif // defined(ACTIVATION_TYPE) -} - -#endif // defined(DATA_TYPE) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/pixelwise_mul_int.cl b/src/core/CL/cl_kernels/pixelwise_mul_int.cl deleted file mode 100644 index 6d1c2d0c79..0000000000 --- a/src/core/CL/cl_kernels/pixelwise_mul_int.cl +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited.
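pixelwise_mul_complex above stores complex values as interleaved (real, imaginary) pairs and applies the textbook product (a + bi)(c + di) = (ac - bd) + (ad + bc)i. The same computation over interleaved buffers, as a scalar C sketch:

    /* Complex product over interleaved re/im pairs, one pair per element. */
    void complex_mul_ref(const float *a, const float *b, float *out, int n)
    {
        for (int i = 0; i < n; ++i)
        {
            float re = a[2 * i] * b[2 * i]     - a[2 * i + 1] * b[2 * i + 1];
            float im = a[2 * i] * b[2 * i + 1] + a[2 * i + 1] * b[2 * i];
            out[2 * i]     = re;
            out[2 * i + 1] = im;
        }
    }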
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(SATURATE) -#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size##_sat(x)) -#else // SATURATE -#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size(x)) -#endif // SATURATE -#define CONVERT_OP_INT(x, type, size) CONVERT_OP_INT_STR(x, type, size) - -#define MUL_OP(x, y, scale, type, size) CONVERT_OP_INT((x) * (y) >> scale, type, size) - -#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) -#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) - -#if defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT) - -#define VEC_ACC_TYPE VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE_OUT) -#define VEC_OUT_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT) - -/** Performs a pixelwise multiplication with integer scale of integer inputs. - * - * @attention The inputs and output data types need to be passed at compile time using -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT: - * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short - * @attention The data type of the intermediate result of the multiplication should be passed as well using -DACC_DATA_TYPE. - * e.g. If one of the inputs is S16 -DACC_DATA_TYPE=int should be passed else -DACC_DATA_TYPE=short. - * - * @param[in] in1_ptr Pointer to the source image. Supported data types: U8/S16 - * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes) - * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] in2_ptr Pointer to the source image.
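Note that `scale` in the integer kernel documented here is a right-shift amount rather than a multiplier: MUL_OP above computes (x * y) >> scale in the wider ACC_DATA_TYPE before the (optionally saturating) down-conversion, so the effective scale factor is 1/2^scale. A scalar sketch for S16 inputs with an int accumulator:

    #include <stdint.h>

    /* Scalar sketch of MUL_OP for S16: widen, multiply, shift, saturate. */
    static int16_t mul_int_ref(int16_t x, int16_t y, unsigned shift)
    {
        int32_t acc = ((int32_t)x * (int32_t)y) >> shift; /* ACC_DATA_TYPE = int */
        if (acc > INT16_MAX) acc = INT16_MAX;             /* -DSATURATE behaviour */
        if (acc < INT16_MIN) acc = INT16_MIN;
        return (int16_t)acc;
    }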
Supported data types: same as @p in1_ptr - * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes) - * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in1_ptr - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes) - * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] scale Integer scaling factor. Supported data types: S32. - */ -__kernel void pixelwise_mul_int( - TENSOR3D_DECLARATION(in1), - TENSOR3D_DECLARATION(in2), -#if !defined(IN_PLACE) - TENSOR3D_DECLARATION(out), -#endif // !defined(IN_PLACE) - const uint scale) -{ - size_t x = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0); - size_t y = get_global_id(1); - size_t z = get_global_id(2); - - __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z; - __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z; - __global uchar * -#if !defined(IN_PLACE) - out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z; -#else // !defined(IN_PLACE) -#if defined(SRC1_IN_PLACE) - out_addr = in1_addr; -#else //defined(SRC1_IN_PLACE) - out_addr = in2_addr; -#endif //defined(SRC1_IN_PLACE) -#endif // !defined(IN_PLACE) - - // Load data - VEC_ACC_TYPE in1_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN1, VEC_SIZE_OUT))VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_IN1 *)in1_addr), VEC_ACC_TYPE); - VEC_ACC_TYPE in2_data = CONVERT((VEC_DATA_TYPE(DATA_TYPE_IN2, VEC_SIZE_OUT))VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_IN2 *)in2_addr), VEC_ACC_TYPE); - // Perform multiplication and store result - VEC_OUT_TYPE out_data0 = MUL_OP(in1_data, in2_data, scale, DATA_TYPE_OUT, VEC_SIZE_OUT); - STORE_VECTOR_SELECT(out_data, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); -} -#endif /* defined(DATA_TYPE_IN1) && defined(DATA_TYPE_IN2) && defined(ACC_DATA_TYPE) && defined(DATA_TYPE_OUT) */ - -#if defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE_OUT) - -#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE_OUT) -#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE_OUT) -#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE_OUT) - -/** Performs a
pixelwise multiplication with float scale of quantized inputs. - * - * @note The quantization offset of the first operand must be passed at compile time only if asymmetric using -DOFFSET_IN1, e.g. -DOFFSET_IN1=10 - * @note The quantization offset of the second operand must be passed at compile time only if asymmetric using -DOFFSET_IN2, e.g. -DOFFSET_IN2=10 - * @note The quantization offset of the output must be passed at compile time only if asymmetric using -DOFFSET_OUT, e.g. -DOFFSET_OUT=10 - * @note The quantization scale of the first operand must be passed at compile time using -DSCALE_IN1, e.g. -DSCALE_IN1=10 - * @note The quantization scale of the second operand must be passed at compile time using -DSCALE_IN2, e.g. -DSCALE_IN2=10 - * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, e.g. -DSCALE_OUT=10 - * @note To perform a saturating operation, -DSATURATE has to be passed to the compiler; otherwise the wrapping policy will be used. - * @attention The data type must be passed at compile time using -DDATA_TYPE_OUT, i.e. -DDATA_TYPE_OUT=uchar - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * - * @param[in] in1_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM16 - * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in1_stride_z Stride of the source image in Z dimension (in bytes) - * @param[in] in1_step_z in1_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] in2_ptr Pointer to the source image. Supported data types: same as @p in1_ptr - * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in2_stride_z Stride of the source image in Z dimension (in bytes) - * @param[in] in2_step_z in2_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in1_ptr - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes) - * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] scale Float scaling factor.
Supported data types: F32 - */ -__kernel void pixelwise_mul_quantized( - TENSOR3D_DECLARATION(in1), - TENSOR3D_DECLARATION(in2), -#if !defined(IN_PLACE) - TENSOR3D_DECLARATION(out), -#endif // !defined(IN_PLACE) - const float scale) -{ - size_t x = max((int)(get_global_id(0) * VEC_SIZE_OUT - (VEC_SIZE_OUT - VEC_SIZE_LEFTOVER) % VEC_SIZE_OUT), 0); - size_t y = get_global_id(1); - size_t z = get_global_id(2); - - __global uchar *in1_addr = in1_ptr + in1_offset_first_element_in_bytes + x * in1_stride_x + y * in1_stride_y + z * in1_stride_z; - __global uchar *in2_addr = in2_ptr + in2_offset_first_element_in_bytes + x * in2_stride_x + y * in2_stride_y + z * in2_stride_z; - __global uchar * -#if !defined(IN_PLACE) - out_addr = out_ptr + out_offset_first_element_in_bytes + x * out_stride_x + y * out_stride_y + z * out_stride_z; -#else // !defined(IN_PLACE) -#if defined(SRC1_IN_PLACE) - out_addr = in1_addr; -#else //defined(SRC1_IN_PLACE) - out_addr = in2_addr; -#endif //defined(SRC1_IN_PLACE) -#endif // !defined(IN_PLACE) - - // Load data - VEC_INT in_a = CONVERT((VEC_TYPE)(VLOAD(VEC_SIZE_IN1)(0, (__global DATA_TYPE_OUT *)in1_addr)), VEC_INT); - VEC_INT in_b = CONVERT((VEC_TYPE)(VLOAD(VEC_SIZE_IN2)(0, (__global DATA_TYPE_OUT *)in2_addr)), VEC_INT); - - // Dequantize -#if defined(OFFSET_IN1) - in_a -= (VEC_INT)((int)OFFSET_IN1); -#endif // defined(OFFSET_IN1) -#if defined(OFFSET_IN2) - in_b -= (VEC_INT)((int)OFFSET_IN2); -#endif // defined(OFFSET_IN2) - const VEC_FLOAT in1f32 = CONVERT(in_a, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN1); - const VEC_FLOAT in2f32 = CONVERT(in_b, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN2); - -#if defined(OFFSET_OUT) - const VEC_FLOAT qresf32 = (in1f32 * in2f32 * scale) / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFFSET_OUT)); -#else // defined(OFFSET_OUT) - const VEC_FLOAT qresf32 = (in1f32 * in2f32 * scale) / ((VEC_FLOAT)(float)SCALE_OUT); -#endif // defined(OFFSET_OUT) - const VEC_TYPE res0 = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_TYPE); - - // Store result - STORE_VECTOR_SELECT(res, DATA_TYPE_OUT, out_addr, VEC_SIZE_OUT, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); -} -#endif /* defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT) && defined(DATA_TYPE_OUT) && defined(VEC_SIZE_OUT) */ diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl deleted file mode 100644 index d63a2e51e8..0000000000 --- a/src/core/CL/cl_kernels/pooling_layer.cl +++ /dev/null @@ -1,981 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
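As in the other quantized kernels, pixelwise_mul_quantized above works in real space: each operand is dequantized with its own (scale, offset) pair, the real-valued product is scaled by the runtime `scale`, and the result is requantized with the output pair and saturated. In scalar form, assuming QASYMM8 (uint8) operands (a hypothetical reference, not library code):

    #include <math.h>
    #include <stdint.h>

    /* Dequantize both inputs, multiply, requantize with the output
     * scale/offset, saturate to uint8. */
    static uint8_t mul_q8_ref(uint8_t a, uint8_t b, float scale,
                              float s1, int o1, float s2, int o2,
                              float s_out, int o_out)
    {
        const float af = (float)((int)a - o1) * s1;
        const float bf = (float)((int)b - o2) * s2;
        const float q  = (af * bf * scale) / s_out + (float)o_out;
        const float r  = rintf(q); /* the kernel rounds with convert_..._rte */
        return (uint8_t)fminf(fmaxf(r, 0.0f), 255.0f);
    }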
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" -#include "repeat.h" -#include "tile_helpers.h" - -#if defined(POOL_AVG) || defined(POOL_L2) -#define POOL_OP(x, y) ((x) + (y)) -#else /* defined(POOL_AVG) || defined(POOL_L2) */ -#define POOL_OP(x, y) (fmax((x), (y))) -#endif /* defined(POOL_AVG) || defined(POOL_L2) */ - -#if defined(POOL_L2) -#define POW2_OP(x, vec_size) ((x) * (x)) -#else /* defined(POOL_L2) */ -#define POW2_OP(x, vec_size) (x) -#endif /* defined(POOL_L2) */ - -#define DIV_OP(x, y) (x * (1.f / y)) -#define SQRT_OP(x) sqrt((x)) - -#if STRIDE_X == 1 -#define POOLING3x3(res, input, output) POOLING3x3_STRIDE1(res, input, output) -#elif STRIDE_X == 2 /* STRIDE_X == 1 */ -#define POOLING3x3(res, input, output) POOLING3x3_STRIDE2(res, input, output) -#elif STRIDE_X == 3 /* STRIDE_X not equals 1 or 2 */ -#define POOLING3x3(res, input, output) POOLING3x3_STRIDE3(res, input, output) -#endif /* STRIDE_X == 3 */ - -#if defined(FP_MIXED_PRECISION) -#define CONVERT_TO_ACC_DATA_TYPE(x, n) CONVERT(x, VEC_DATA_TYPE(ACC_DATA_TYPE, n)) -#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) \ - CONVERT_TO_ACC_DATA_TYPE(vload##n(offset, ptr), n) -#else /* defined(FP_MIXED_PRECISION) */ -#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) vload##n(offset, ptr) -#endif /* defined(FP_MIXED_PRECISION) */ - -#define POOLING3x3_STRIDE1(res, input, output) \ - ({ \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ - data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 2) \ - data01 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 4); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ - data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 2) \ - data11 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 4); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ - data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 2) \ - data21 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 4); \ - data00 = POW2_OP(data00, 4); \ - data01 = POW2_OP(data01, 2); \ - data10 = POW2_OP(data10, 4); \ - data11 = POW2_OP(data11, 2); \ - data20 = POW2_OP(data20, 4); \ - data21 = POW2_OP(data21, 2); \ - \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ - values00 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data00.s01212323); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ - values01 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data01.s0, data00.s3, data01.s01); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ - values10 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data10.s01212323); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ - values11 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data11.s0, data10.s3, data11.s01); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ - values20 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data20.s01212323); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ - values21 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data21.s0, data20.s3, data21.s01); \ - \ - values00 = POOL_OP(values00, values10); \ - values01 = POOL_OP(values01, values11); \ - values00 = POOL_OP(values00, values20); \ - 
values01 = POOL_OP(values01, values21); \ - \ - res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s147, values01.s2)); \ - res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s25, values01.s03)); \ - }) - -#define POOLING3x3_STRIDE2(res, input, output) \ - ({ \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ - data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \ - ACC_DATA_TYPE data01 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8)); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ - data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \ - ACC_DATA_TYPE data11 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8)); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ - data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \ - ACC_DATA_TYPE data21 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8)); \ - data00 = POW2_OP(data00, 8); \ - data01 = POW2_OP(data01, 1); \ - data10 = POW2_OP(data10, 8); \ - data11 = POW2_OP(data11, 1); \ - data20 = POW2_OP(data20, 8); \ - data21 = POW2_OP(data21, 1); \ - \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ - values00 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data00.s01223445); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ - values01 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s667, data01); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ - values10 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data10.s01223445); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ - values11 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data10.s667, data11); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ - values20 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data20.s01223445); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ - values21 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data20.s667, data21); \ - \ - values00 = POOL_OP(values00, values10); \ - values01 = POOL_OP(values01, values11); \ - values00 = POOL_OP(values00, values20); \ - values01 = POOL_OP(values01, values21); \ - \ - res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s147, values01.s2)); \ - res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s25, values01.s03)); \ - }) - -#define POOLING3x3_STRIDE3(res, input, output) \ - ({ \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ - data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ - data01 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ - data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ - data11 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ - data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ - data21 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8); \ - data00 = POW2_OP(data00, 8); \ - data01 = POW2_OP(data01, 4); \ - data10 = POW2_OP(data10, 8); \ - data11 = POW2_OP(data11, 4); \ - data20 = POW2_OP(data20, 8); \ - data21 = POW2_OP(data21, 4); \ - \ - data00 = POOL_OP(data00, data10); \ - 
data01 = POOL_OP(data01, data11); \ - data00 = POOL_OP(data00, data20); \ - data01 = POOL_OP(data01, data21); \ - \ - res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s036, data01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s147, data01.s2)); \ - res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s25, data01.s03)); \ - }) - -ACC_DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h, - const int pad_x, const int pad_y, const int stride_x, const int stride_y) -{ - int start_x = get_global_id(0) * stride_x - pad_x; - int start_y = get_global_id(1) * stride_y - pad_y; - const int end_x = min(start_x + pool_size_x, upper_bound_w); - const int end_y = min(start_y + pool_size_y, upper_bound_h); -#if defined(EXCLUDE_PADDING) - start_x = max(0, start_x); - start_y = max(0, start_y); -#endif /* defined(EXCLUDE_PADDING) */ - return ((end_y - start_y) * (end_x - start_x)); -} - -/** Performs a pooling function of pool size equal to 2. - * - * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32; - * @note In case of average pooling the following information must be passed at compile time: - * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed. - * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) - * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions - * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. 
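/* A minimal host-side sketch of what calculate_avg_scale above returns: the
 * area of the pooling window clipped to the padded upper bounds, with the
 * window start additionally clamped to 0 when -DEXCLUDE_PADDING is set so
 * padded elements do not contribute to the average. Plain C; the helper name
 * is hypothetical, the arithmetic mirrors the kernel code. */
static int avg_pool_divisor(int out_x, int out_y, int pool_w, int pool_h,
                            int upper_w, int upper_h, /* width/height incl. pad */
                            int pad_x, int pad_y, int stride_x, int stride_y,
                            int exclude_padding)
{
    int start_x = out_x * stride_x - pad_x;
    int start_y = out_y * stride_y - pad_y;
    const int end_x = (start_x + pool_w < upper_w) ? (start_x + pool_w) : upper_w;
    const int end_y = (start_y + pool_h < upper_h) ? (start_y + pool_h) : upper_h;
    if (exclude_padding)
    {
        start_x = (start_x > 0) ? start_x : 0;
        start_y = (start_y > 0) ? start_y : 0;
    }
    return (end_y - start_y) * (end_x - start_x);
}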
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void pooling_layer_2( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - // Load data - VEC_DATA_TYPE(ACC_DATA_TYPE, 2) - data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); - VEC_DATA_TYPE(ACC_DATA_TYPE, 2) - data1 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); - -#if defined(POOL_L2) - // Raise to power of 2 for L2 Pooling - data0 = POW2_OP(data0, 2); - data1 = POW2_OP(data1, 2); -#endif /* defined(POOL_L2) */ - - // Perform calculations - data0 = POOL_OP(data0, data1); - ACC_DATA_TYPE res = POOL_OP(data0.s0, data0.s1); - -#if defined(POOL_AVG) || defined(POOL_L2) - // Divide by pool region in case of average or l2 pooling - res = DIV_OP(res, calculate_avg_scale(2, 2, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)); -#endif /* defined(POOL_AVG) || defined(POOL_L2) */ - -#if defined(POOL_L2) - // Take square root of the result in L2 pooling - res = SQRT_OP(res); -#endif /* defined(POOL_L2) */ - - // Store result - *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res; -} - -/** Performs a pooling function of pool size equal to 3 - * - * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32; - * @note In case of average pooling the following information must be passed at compile time: - * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed. - * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) - * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions - * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. 
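/* A scalar reference for one output element of pooling_layer_2 above, showing
 * how POOL_OP/POW2_OP/DIV_OP/SQRT_OP compose for the three pooling modes.
 * Plain C sketch with hypothetical names; in the real kernel the divisor comes
 * from calculate_avg_scale. */
#include <math.h>
enum pool_mode { POOL_MODE_MAX, POOL_MODE_AVG, POOL_MODE_L2 };
static float pool_2x2_ref(const float w[4], enum pool_mode mode, float divisor)
{
    if (mode == POOL_MODE_MAX)
        return fmaxf(fmaxf(w[0], w[1]), fmaxf(w[2], w[3]));
    float acc = 0.f;
    for (int i = 0; i < 4; ++i)
        acc += (mode == POOL_MODE_L2) ? w[i] * w[i] : w[i]; /* POW2_OP */
    acc *= 1.f / divisor;                                   /* DIV_OP  */
    return (mode == POOL_MODE_L2) ? sqrtf(acc) : acc;       /* SQRT_OP */
}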
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void pooling_layer_3( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - // Load data - VEC_DATA_TYPE(ACC_DATA_TYPE, 3) - data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); - VEC_DATA_TYPE(ACC_DATA_TYPE, 3) - data1 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); - VEC_DATA_TYPE(ACC_DATA_TYPE, 3) - data2 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); - -#if defined(POOL_L2) - // Raise to power of 2 for L2 Pooling - data0 = POW2_OP(data0, 3); - data1 = POW2_OP(data1, 3); - data2 = POW2_OP(data2, 3); -#endif /* defined(POOL_L2) */ - - // Perform calculations - data0 = POOL_OP(data0, data1); - data0 = POOL_OP(data0, data2); - ACC_DATA_TYPE res = POOL_OP(POOL_OP(data0.s0, data0.s1), data0.s2); - -#if defined(POOL_AVG) || defined(POOL_L2) - // Divide by pool region in case of average pooling - res = DIV_OP(res, calculate_avg_scale(3, 3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)); -#endif /* defined(POOL_AVG) || defined(POOL_L2) */ - -#if defined(POOL_L2) - // Take square root of the result in L2 pooling - res = SQRT_OP(res); -#endif /* defined(POOL_L2) */ - - // Store result - *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res; -} - -#if defined(POOLING3x3) - -#define CONVERT_OP(data_type) convert_##data_type##4 -#define CONVERT_VECTOR4(data_type) CONVERT_OP(data_type) - -VEC_DATA_TYPE(ACC_DATA_TYPE, 4) -calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upper_bound_h, - const int pad_x, const int pad_y, const int stride_x, const int stride_y) -{ - int4 start_x = ((int4)get_global_id(0) * 4 + (int4)(0, 1, 2, 3)) * (int4)stride_x - (int4)pad_x; - int start_y = get_global_id(1) * stride_y - pad_y; - const int4 end_x = min(start_x + (int4)pool_size, (int4)upper_bound_w); - const int end_y = min(start_y + pool_size, upper_bound_h); -#if defined(EXCLUDE_PADDING) - start_x = max((int4)0, start_x); - start_y = max(0, start_y); -#endif /* defined(EXCLUDE_PADDING) */ - return (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(1.f) / CONVERT_VECTOR4(ACC_DATA_TYPE)(((int4)(end_y - start_y)) * (end_x - start_x)); -} - -/** Performs an optimized pooling function of pool size equal to 3 when the stride_x is less equal than 3 - * - * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32; - * @note In case of average pooling the following information must be passed at compile time: - * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed. 
- * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) - * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions - * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void pooling_layer_optimized_3( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) - res; - - // Perform pooling 3x3 for 4 output elements - POOLING3x3(res, input, output); - -#if defined(POOL_AVG) || defined(POOL_L2) - // Divide by pool region in case of average pooling - res *= calculate_avg_scale4(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); -#endif /* defined(POOL_AVG) || defined(POOL_L2) */ - -#if defined(POOL_L2) - // Take square root of the result in L2 pooling - res = SQRT_OP(res); -#endif /* defined(POOL_L2) */ - - vstore4(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 4)), 0, (__global DATA_TYPE *)output.ptr); -} -#endif // defined(POOLING3x3) - -#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) - -/** Performs a pooling function of pool size equal to N (NCHW) - * - * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32; - * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; - * @note In case of average pooling the following information must be passed at compile time: - * -DPOOL_AVG must be provided otherwise max pooling will be performed. 
- * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) - * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions - * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension - * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void pooling_layer_MxN_nchw( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) - vdata = INITIAL_VALUE; - ACC_DATA_TYPE sdata = INITIAL_VALUE; - - // Load data - for(int y = 0; y < POOL_SIZE_Y; y++) - { - int x = 0; - for(; x <= ((int)POOL_SIZE_X - 8); x += 8) - { - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) - data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)); -#if defined(POOL_L2) - // Raise to power of 2 for L2 Pooling - data0 *= data0; -#endif /* defined(POOL_L2) */ - vdata = POOL_OP(vdata, data0); - } - - // Leftover - for(; x < (int)POOL_SIZE_X; ++x) - { - ACC_DATA_TYPE data0 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0))); -#if defined(POOL_L2) - // Raise to power of 2 for L2 Pooling - data0 *= data0; -#endif /* defined(POOL_L2) */ - sdata = POOL_OP(sdata, data0); - } - } - - // Reduce result - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) - reduce4 = POOL_OP(vdata.s0123, vdata.s4567); - VEC_DATA_TYPE(ACC_DATA_TYPE, 2) - reduce2 = POOL_OP(reduce4.s01, reduce4.s23); - ACC_DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1); - res = POOL_OP(res, sdata); - -#if defined(POOL_AVG) || defined(POOL_L2) - // Divide by pool region in case of average pooling - res = DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)); -#endif /* defined(POOL_AVG) 
|| defined(POOL_L2) */ - -#if defined(POOL_L2) - // Take square root of the result in L2 pooling - res = SQRT_OP(res); -#endif /* defined(POOL_L2) */ - - // Store result - *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res; -} -#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) - -#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) - -inline void offset_no_padding_nchw(const Tensor3D *input, uint *offset_top, uint *offset_bottom) -{ - const int pad_horiz = PAD_TENSOR_LEFT + PAD_TENSOR_RIGHT; - const int pad_vert = PAD_TENSOR_TOP + PAD_TENSOR_BOTTOM; - - const int x = get_global_id(0) * STRIDE_X; - const int y = get_global_id(1) * STRIDE_Y; - const int z = get_global_id(2); - - //x axis: width, y axis: height, z axis: component - const uint padded_offset = input->offset_first_element_in_bytes - + x * input->stride_x - + y * input->stride_y - + z * input->stride_z; - - const uint offset_base = padded_offset - - y * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each row */ - - PAD_TENSOR_TOP * input->stride_y /* top padding */ - - z * MAX_HEIGHT * pad_horiz * sizeof(DATA_TYPE) - z * pad_vert * input->stride_y /* Z plane padding */ - - PAD_TENSOR_LEFT * sizeof(DATA_TYPE); - -#if defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) - *offset_top = (uint)((offset_base / sizeof(DATA_TYPE)) % (TENSOR_CHANNEL * TENSOR_WIDTH * TENSOR_HEIGHT)); -#else /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */ - *offset_top = (uint)(offset_base / sizeof(DATA_TYPE)); -#endif /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */ - - *offset_bottom = *offset_top + input->stride_y / sizeof(DATA_TYPE) - pad_horiz; - - return; -} - -#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) - -/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW. - * - * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32 - * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; - * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT - * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions - * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. 
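/* What offset_no_padding_nchw above computes, in a direct form: starting from
 * the padded byte offset it strips the left/right row padding, the top rows
 * and the per-plane padding, so the result is, in effect, the element index
 * the value would have in a tightly packed W x H x C tensor, i.e.
 * x + y*W + z*W*H; offset_bottom is the element one row below (the kernel's
 * stride_y / sizeof(DATA_TYPE) - pad_horiz equals W). Hypothetical plain-C
 * equivalent: */
static void unpadded_indices(int x, int y, int z, int width, int height,
                             unsigned *offset_top, unsigned *offset_bottom)
{
    *offset_top    = (unsigned)(x + y * width + z * width * height);
    *offset_bottom = *offset_top + (unsigned)width; /* same column, next row */
}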
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32 - * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes) - * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes) - * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes) - * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor - */ -__kernel void pooling_layer_2_nchw_indices_fp32( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output), - TENSOR3D_DECLARATION(indices)) -{ - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices); - - // Load data - float2 data0 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 0, 0)); - float2 data1 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 1, 0)); - - // Perform calculations - float data0_max = POOL_OP(data0.s0, data0.s1); - float data1_max = POOL_OP(data1.s0, data1.s1); - float res = POOL_OP(data0_max, data1_max); - // Store result - *(__global float *)output.ptr = res; - -#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) - - uint offset_top = 0; - uint offset_bottom = 0; - - offset_no_padding_nchw(&input, &offset_top, &offset_bottom); - - uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1)); - uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1)); - uint index = select(index1, index0, isgreaterequal(data0_max, data1_max)); - - *(__global uint *)indices.ptr = index; - -#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) -} - -/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW. - * - * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F16 - * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. 
-DPOOL_SIZE_X=13; - * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT - * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions - * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] indices_ptr Pointer to the indices tensor. 
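/* The branchless 2x2 argmax used by the pooling_layer_2_nchw_indices_*
 * kernels: each select keeps the offset of the larger element of a pair (ties
 * go to the first operand, matching isgreaterequal), then the two row winners
 * are compared. Scalar C sketch with hypothetical names: */
static unsigned argmax_2x2(const float top[2], const float bot[2],
                           unsigned offset_top, unsigned offset_bottom)
{
    const unsigned idx0 = (top[0] >= top[1]) ? offset_top : offset_top + 1;
    const unsigned idx1 = (bot[0] >= bot[1]) ? offset_bottom : offset_bottom + 1;
    const float    max0 = (top[0] >= top[1]) ? top[0] : top[1];
    const float    max1 = (bot[0] >= bot[1]) ? bot[0] : bot[1];
    return (max0 >= max1) ? idx0 : idx1;
}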
Supported data types: U32 - * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes) - * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes) - * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes) - * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor - */ -__kernel void pooling_layer_2_nchw_indices_fp16( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output), - TENSOR3D_DECLARATION(indices)) -{ - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices); - - // Load data - half2 data0 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 0, 0)); - half2 data1 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 1, 0)); - - // Perform calculations - half data0_max = POOL_OP(data0.s0, data0.s1); - half data1_max = POOL_OP(data1.s0, data1.s1); - half res = POOL_OP(data0_max, data1_max); - // Store result - *(__global half *)output.ptr = res; - -#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) - - uint offset_top = 0; - uint offset_bottom = 0; - - offset_no_padding_nchw(&input, &offset_top, &offset_bottom); - - uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1)); - uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1)); - uint index = select(index1, index0, isgreaterequal(data0_max, data1_max)); - - *(__global uint *)indices.ptr = index; - -#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) -} - -#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE) - -#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) -/** Performs pooling layer of size equal to MxN. This OpenCL kernel can perform the following pooling types: - * -# max, -DPOOL_MAX must be passed at compile time - * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be expluded, -DEXCLUDE_PADDING should be passed at compile time - * -# l2 normalisation, -DPOOL_L2 must be passed at compile time - * - * @note Datatype must be passed at compile type using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16 - * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float - * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result - * @note Pool size must be passed at compile time using -DPOOL_SIZE_X and -DPOOL_SIZE_Y. e.g. 
-DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4 - * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT - * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE - * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions - * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y - * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE - * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void pooling_layer_MxN_nhwc( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output)) -{ - // Note: If C is not multiple of VEC_SIZE, we shift back of VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0 - // Note: If C is less than VEC_SIZE, VEC_SIZE should be SHRINKED to the closest smaller VEC_SIZE. 
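/* The channel-index arithmetic used by the NHWC kernels when the channel
 * count is not a multiple of VEC_SIZE: each work-item starts at
 * max(gid * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE, 0). For
 * example, with 19 channels and VEC_SIZE 8 (leftover 3, shift 5), work-items
 * 0, 1 and 2 cover channels [0..7], [3..10] and [11..18]; STORE_VECTOR_SELECT
 * then makes work-item 0 store only the 3 leftover lanes, while the shifted
 * full vectors end exactly at the last channel. Hypothetical helper: */
static int vec_start_channel(int gid, int vec_size, int leftover)
{
    const int x = gid * vec_size - (vec_size - leftover) % vec_size;
    return (x > 0) ? x : 0;
}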
This operation is performed on the host side - int idx_out_c = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER); - int idx_out_w = GET_SPATIAL_IDX(1, 1, 0); -#if DST_BATCH_SIZE != 1 - // If batch size != 1, the batch size dimension is collapsed over the height dimension - int idx_out_h = GET_SPATIAL_IDX(2, 1, 0) % DST_HEIGHT; - int idx_out_n = GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT; -#else //DST_BATCH_SIZE != 1 - int idx_out_h = GET_SPATIAL_IDX(2, 1, 0); - int idx_out_n = 0; -#endif // DST_BATCH_SIZE != 1 - - __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w; - - __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n * - output_stride_w; - - VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) - res0 = INITIAL_VALUE; - - int idx_in_w = idx_out_w * STRIDE_X - PAD_X; - int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y; - - int pool_x_s = max((int)0, -idx_in_w); - int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w); - int pool_y_s = max((int)0, -idx_in_h); - int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h); - -#if defined(EXCLUDE_PADDING) - int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s); -#else // defined(EXCLUDE_PADDING) - int filter_size = POOL_SIZE_X * POOL_SIZE_Y; -#endif // defined(EXCLUDE_PADDING) - -#if POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0 - // Global pooling path - for(int y = 0; y < POOL_SIZE_Y; ++y) - { -#pragma unroll 8 - for(int x = 0; x < POOL_SIZE_X; ++x) - { -#else // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0 - for(int y = pool_y_s; y < pool_y_e; ++y) - { -#pragma unroll 8 - for(int x = pool_x_s; x < pool_x_e; ++x) - { -#endif // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0 - VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) - data0; -#if defined(FP_MIXED_PRECISION) - // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE - data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); -#else // defined(FP_MIXED_PRECISION) - data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)); -#endif // defined(FP_MIXED_PRECISION) - -#if defined(POOL_L2) - // Raise to power of 2 for L2 Pooling - data0 *= data0; -#endif // defined(POOL_L2) - res0 = POOL_OP(res0, data0); - } - } - -#if defined(POOL_AVG) || defined(POOL_L2) - res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size; -#endif // defined(POOL_AVG) || defined(POOL_L2) - -#if defined(POOL_L2) - // Take square root of the result in L2 pooling - res0 = SQRT_OP(res0); -#endif // defined(POOL_L2) - - // Store result -#if defined(FP_MIXED_PRECISION) - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); - STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); -#else // defined(FP_MIXED_PRECISION) - STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); -#endif // defined(FP_MIXED_PRECISION) -} -#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) - -#define 
SELECT_TYPE SELECT_VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) - -/** Performs pooling layer of size equal to 2. This OpenCL kernel can perform the following pooling types: - * -# max, -DPOOL_MAX must be passed at compile time - * -# max extracting the max index, -DPOOL_MAX and -DEXTRACT_MAX_INDEX must be passed at compile time - * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time - * -# l2 normalisation, -DPOOL_L2 must be passed at compile time - * - * @note Datatype must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16 - * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float - * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result - * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT - * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE - * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions - * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y - * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VEC_SIZE - * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor.
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] indices_ptr (Optional) Pointer to the indices tensor. Supported data types: U32 - * @param[in] indices_stride_x (Optional) Stride of the indices tensor in X dimension (in bytes) - * @param[in] indices_step_x (Optional) indices_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] indices_stride_y (Optional) Stride of the indices tensor in Y dimension (in bytes) - * @param[in] indices_step_y (Optional) indices_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] indices_stride_z (Optional) Stride of the indices tensor in Z dimension (in bytes) - * @param[in] indices_step_z (Optional) indices_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] indices_stride_w (Optional) Stride of the indices tensor in W dimension (in bytes) - * @param[in] indices_step_w (Optional) indices_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] indices_offset_first_element_in_bytes (Optional) The offset of the first element in the indices tensor - */ -__kernel void pooling_layer_2x2_nhwc( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output) -#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX) - , - TENSOR4D_DECLARATION(indices) -#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX) -) -{ - // Note: If C is not multiple of VEC_SIZE, we shift back of VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0 - // Note: If C is less than VEC_SIZE, VEC_SIZE should be SHRINKED to the closest smaller VEC_SIZE. 
This operation is performed on the host side - int idx_out_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); - int idx_out_w = get_global_id(1); -#if DST_BATCH_SIZE != 1 - // If batch size != 1, the batch size dimension is collapsed over the height dimension - int idx_out_h = get_global_id(2) % DST_HEIGHT; - int idx_out_n = get_global_id(2) / DST_HEIGHT; -#else //SRC_BATCH_SIZE != 1 - int idx_out_h = get_global_id(2); - int idx_out_n = 0; -#endif // SRC_BATCH_SIZE != 1 - - int idx_in_w = idx_out_w * STRIDE_X - PAD_X; - int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y; - - __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w; - - __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n * - output_stride_w; - - int pool_x_s = max((int)0, -idx_in_w); - int pool_x_e = min((int)2, (int)SRC_WIDTH - idx_in_w); - int pool_y_s = max((int)0, -idx_in_h); - int pool_y_e = min((int)2, (int)SRC_HEIGHT - idx_in_h); - - int filter_size = (pool_x_e - pool_x_s) * (pool_y_e - pool_y_s); - - int x0 = pool_x_s + idx_in_w; - int y0 = pool_y_s + idx_in_h; - int x1 = pool_x_e - 1 + idx_in_w; - int y1 = pool_y_e - 1 + idx_in_h; - - REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE), data, 0); - -#if defined(FP_MIXED_PRECISION) - // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE - data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); - data1 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); - data2 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); - data3 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); -#else // defined(FP_MIXED_PRECISION) - data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z)); - data1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z)); - data2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z)); - data3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z)); -#endif // defined(FP_MIXED_PRECISION) - -#if !defined(POOL_MAX) - if(filter_size != 4) - { - SELECT_TYPE cond_w_s = (SELECT_TYPE)idx_in_w < (SELECT_TYPE)0; - SELECT_TYPE cond_w_e = (SELECT_TYPE)idx_in_w >= (SELECT_TYPE)(SRC_WIDTH - 1); - SELECT_TYPE cond_h_s = (SELECT_TYPE)idx_in_h < (SELECT_TYPE)0; - SELECT_TYPE cond_h_e = (SELECT_TYPE)idx_in_h >= (SELECT_TYPE)(SRC_HEIGHT - 1); - - // Make invalid the values loaded if the x or y coordinate was clamped (out-of-bound) - data0 = select(data0, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_s)); - data1 = select(data1, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_s)); - data2 = select(data2, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_e)); - data3 = select(data3, 
(VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_e)); - } -#endif // !defined(POOL_MAX) - -#if defined(POOL_L2) - // Raise to power of 2 for L2 Pooling - data0 *= data0; - data1 *= data1; - data2 *= data2; - data3 *= data3; -#endif /* defined(POOL_L2) */ - - VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) - res0 = data0; - res0 = POOL_OP(res0, data1); - res0 = POOL_OP(res0, data2); - res0 = POOL_OP(res0, data3); - -#if defined(POOL_AVG) || defined(POOL_L2) -#if defined(EXCLUDE_PADDING) - res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size; -#else // !defined(EXCLUDE_PADDING) - res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))4; -#endif // defined(EXCLUDE_PADDING) -#endif // defined(POOL_AVG) || defined(POOL_L2) - -#if defined(POOL_L2) - // Take square root of the result in L2 pooling - res0 = SQRT_OP(res0); -#endif // defined(POOL_L2) - - // Store result -#if defined(FP_MIXED_PRECISION) - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); - STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); -#else // defined(FP_MIXED_PRECISION) - STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); -#endif // defined(FP_MIXED_PRECISION) - -#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX) - - // This part is used to return the index of the maximum value - // Note: DST_CHANNELS and DST_BATCH_SIZE can be used for either the input and output tensor - - // note: Batch dimension does not contribute in the offset contribution - VEC_DATA_TYPE(uint, VEC_SIZE) - base_index = (uint)idx_out_c; - - base_index += VEC_OFFS(uint, VEC_SIZE); - - VEC_DATA_TYPE(uint, VEC_SIZE) - index0 = base_index + (uint)x0 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH); - VEC_DATA_TYPE(uint, VEC_SIZE) - index1 = base_index + (uint)x1 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH); - VEC_DATA_TYPE(uint, VEC_SIZE) - index2 = base_index + (uint)x0 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH); - VEC_DATA_TYPE(uint, VEC_SIZE) - index3 = base_index + (uint)x1 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH); - - index0 = select(index1, index0, CONVERT(isgreaterequal(data0, data1), VEC_DATA_TYPE(int, VEC_SIZE))); - index1 = select(index3, index2, CONVERT(isgreaterequal(data2, data3), VEC_DATA_TYPE(int, VEC_SIZE))); - index0 = select(index1, index0, CONVERT(isgreaterequal(max(data0, data1), max(data2, data3)), VEC_DATA_TYPE(int, VEC_SIZE))); - - __global unsigned char *idx_base_ptr = indices_ptr + indices_offset_first_element_in_bytes + idx_out_c * sizeof(uint) + idx_out_w * indices_stride_y + idx_out_h * indices_stride_z + idx_out_n * - indices_stride_w; - - // Store result - STORE_VECTOR_SELECT(index, uint, idx_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0)); -#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX) -} -#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/pooling_layer_quantized.cl deleted file mode 100644 index d8cef2b4e6..0000000000 --- a/src/core/CL/cl_kernels/pooling_layer_quantized.cl +++ /dev/null @@ -1,266 +0,0 @@ -/* - * 
Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(INITIAL_VALUE) -#define VEC_TYPE(VEC_SIZE) VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - -#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) -#define VEC_FLOAT(VEC_SIZE) VEC_DATA_TYPE(float, VEC_SIZE) -#define VEC_INT(VEC_SIZE) VEC_DATA_TYPE(int, VEC_SIZE) -#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) -#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) -#define REQUANTIZE(VEC_SIZE, input, in_offset, out_offset, in_scale, out_scale, res) \ - { \ - const VEC_FLOAT(VEC_SIZE) in_f32 = (CONVERT(input, VEC_FLOAT(VEC_SIZE)) - (VEC_FLOAT(VEC_SIZE))((float)in_offset)) * (VEC_FLOAT(VEC_SIZE))((float)in_scale); \ - const VEC_FLOAT(VEC_SIZE) out_f32 = in_f32 / ((VEC_FLOAT(VEC_SIZE))(float)out_scale) + ((VEC_FLOAT(VEC_SIZE))((float)out_offset)); \ - res = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT(VEC_SIZE)), VEC_TYPE(VEC_SIZE)); \ - } -#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ - -#if defined(POOL_AVG) -#define POOL_OP(x, y) ((x) + (y)) -#else /* defined(POOL_AVG) */ -#define POOL_OP(x, y) (max((x), (y))) -#endif /* defined(POOL_AVG) */ - -#define DIV_OP(x, y) (x * (1.f / y)) - -#if defined(POOL_L2) -#error "L2 pooling is not supported" -#endif /* defined(POOL_L2) */ - -int calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h, - const int pad_x, const int pad_y, const int stride_x, const int stride_y) -{ - int start_x = get_global_id(0) * stride_x - pad_x; - int start_y = get_global_id(1) * stride_y - pad_y; - const int end_x = min(start_x + pool_size_x, upper_bound_w); - const int end_y = min(start_y + pool_size_y, upper_bound_h); -#if defined(EXCLUDE_PADDING) - start_x = max(0, start_x); - start_y = max(0, start_y); -#endif /* defined(EXCLUDE_PADDING) */ - return ((end_y - start_y) * (end_x - start_x)); -} - -/** Performs a pooling function of pool size equal to N (NCHW) - * - * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; - * @note In case of average pooling the following information must be passed at compile time: - * -DPOOL_AVG must be provided otherwise max pooling will be performed. 
- * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indices in x and y dimensions (width + pad) - * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions - * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension - * @note Input data type must be passed at compile time using -DDATA_TYPE=type, e.g. -DDATA_TYPE=uchar - * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 - * - * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void pooling_layer_MxN_quantized_nchw( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - int8 vdata = INITIAL_VALUE; - int sdata = INITIAL_VALUE; - - // Load data - for(int y = 0; y < POOL_SIZE_Y; y++) - { - int x = 0; - for(; x <= ((int)POOL_SIZE_X - 8); x += 8) - { - VEC_TYPE(8) - data = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)); - int8 data0 = convert_int8(data); - vdata = POOL_OP(vdata, data0); - } - - // Leftover - for(; x < (int)POOL_SIZE_X; ++x) - { - DATA_TYPE data = *((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)); - int data0 = convert_int(data); - sdata = POOL_OP(sdata, data0); - } - } - - // Reduce result - int4 reduce4 = POOL_OP(vdata.s0123, vdata.s4567); - int2 reduce2 = POOL_OP(reduce4.s01, reduce4.s23); - int res = POOL_OP(reduce2.s0, reduce2.s1); - res = POOL_OP(res, sdata); - -#if defined(POOL_AVG) - res = round(DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y))); -#endif /* defined(POOL_AVG) */ - - DATA_TYPE result_q8 = CONVERT(res, DATA_TYPE); - -#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) - - const float result_f32 = convert_float(result_q8); - const float input_offset = (float)OFFSET_IN1; - const float input_scale =
-    const float scale_out    = (float)SCALE_OUT;
-    const float offset_out   = (float)OFFSET_OUT;
-    const float in_f32       = (result_f32 - input_offset) * input_scale;
-    const float out_f32      = in_f32 / scale_out + offset_out;
-    result_q8                = CONVERT_SAT(convert_int_rte(out_f32), DATA_TYPE);
-
-#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
-
-    *(__global DATA_TYPE *)output.ptr = result_q8;
-}
-
-#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
-/** Performs pooling layer of size equal to MxN. This OpenCL kernel can perform the following pooling types:
- * -# max, -DPOOL_MAX must be passed at compile time
- * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be excluded, -DEXCLUDE_PADDING should be passed at compile time
- *
- * @note Datatype must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=uchar. Supported data types are QASYMM8/QASYMM8_SIGNED
- * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=int
- * @note Pool size must be passed at compile time using -DPOOL_SIZE_X and -DPOOL_SIZE_Y. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4
- * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
- * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
- * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
- * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
- * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
- * @note If the output has to be requantized, -DOFFSET_IN1, -DOFFSET_OUT, -DSCALE_IN1 and -DSCALE_OUT must be passed at compile time
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_stride_w                       Stride of the source tensor in W dimension (in bytes)
- * @param[in]  input_step_w                         input_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the destination tensor in W dimension (in bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void pooling_layer_MxN_quantized_nhwc(
-    TENSOR4D_DECLARATION(input),
-    TENSOR4D_DECLARATION(output))
-{
-    // Note: If C is not a multiple of VEC_SIZE, we shift back by VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0
-    // Note: If C is less than VEC_SIZE, VEC_SIZE should be shrunk to the closest smaller vector size. This operation is performed on the host side
-    int offset_c  = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0) * sizeof(DATA_TYPE);
-    int idx_out_w = get_global_id(1);
-#if DST_BATCH_SIZE != 1
-    // If batch size != 1, the batch size dimension is collapsed over the height dimension
-    int idx_out_h = get_global_id(2) % DST_HEIGHT;
-    int idx_out_n = get_global_id(2) / DST_HEIGHT;
-#else  //DST_BATCH_SIZE != 1
-    int idx_out_h = get_global_id(2);
-    int idx_out_n = 0;
-#endif // DST_BATCH_SIZE != 1
-
-    int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
-    int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
-
-    __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + offset_c + idx_out_n * input_stride_w;
-
-    __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + offset_c + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n * output_stride_w;
-
-    int pool_x_s = max((int)0, -idx_in_w);
-    int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
-    int pool_y_s = max((int)0, -idx_in_h);
-    int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
-
-#if defined(POOL_AVG) && defined(EXCLUDE_PADDING)
-    int filter_size = 0;
-#elif defined(POOL_AVG) && !defined(EXCLUDE_PADDING) // defined(POOL_AVG) && defined(EXCLUDE_PADDING)
-    int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
-#endif // defined(POOL_AVG) && !defined(EXCLUDE_PADDING)
-
-    VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
-    res0 = INITIAL_VALUE;
-
-    for(int y = pool_y_s; y < pool_y_e; ++y)
-    {
-        for(int x = pool_x_s; x < pool_x_e; ++x)
-        {
-            VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-            data;
-            VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
-            data0;
-
-            data  = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z));
-            data0 = CONVERT(data, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
-
-            res0 = POOL_OP(res0, data0);
-
-#if defined(POOL_AVG) && defined(EXCLUDE_PADDING)
-            filter_size++;
-#endif // defined(POOL_AVG) && defined(EXCLUDE_PADDING)
-        }
-    }
-
-#if defined(POOL_AVG)
-    res0 = (res0 + (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))(filter_size >> 1)) / filter_size;
-#endif // defined(POOL_AVG)
-
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    out_q0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
-#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
-    REQUANTIZE(VEC_SIZE, out_q0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT, out_q0);
-#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
-
-    // Store result
-    STORE_VECTOR_SELECT(out_q, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0));
-}
-#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
-#endif // defined(DATA_TYPE) && defined(INITIAL_VALUE)
\ No newline at end of file
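For reference, the REQUANTIZE step used by both pooling kernels above is plain affine requantization. A minimal host-side C sketch of the same arithmetic (function name and signature are illustrative, not part of the library):

    #include <math.h>

    /* Mirror of the REQUANTIZE macro: dequantize with the input
     * (scale, offset) pair, then quantize with the output pair using
     * round-to-nearest-even, as the convert_..._rte conversions do. */
    static inline int requantize(int q, float in_scale, int in_offset,
                                 float out_scale, int out_offset)
    {
        const float in_f32  = (q - in_offset) * in_scale;
        const float out_f32 = in_f32 / out_scale + out_offset;
        return (int)rintf(out_f32); /* the kernels additionally saturate to the 8-bit range */
    }

For example, requantize(130, 0.5f, 128, 1.0f, 0) dequantizes 130 to 1.0 and requantizes it to 1.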
diff --git a/src/core/CL/cl_kernels/prior_box_layer.cl b/src/core/CL/cl_kernels/prior_box_layer.cl
deleted file mode 100644
index de10decdec..0000000000
--- a/src/core/CL/cl_kernels/prior_box_layer.cl
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * Copyright (c) 2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(LAYER_WIDTH) && defined(LAYER_HEIGHT) && defined(OFFSET) && defined(STEP_X) && defined(STEP_Y) && defined(NUM_PRIORS) && defined(VARIANCE_0) && defined(VARIANCE_1) && defined(VARIANCE_2) && defined(VARIANCE_3)
-
-/** Compute prior boxes and clip (NCHW)
- *
- * @param[out] output_ptr                           Pointer to the destination tensor.
Supported data types: F32 - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] idx Index to write to - * @param[in] center_x Center value of the x axis - * @param[in] center_y Center value of the y axis - * @param[in] box_width Prior box width - * @param[in] box_height Prior box height - * - */ -inline void calculate_xy_min_max_nchw(Image *out, int idx, float center_x, float center_y, float box_width, float box_height) -{ - float xmin = (center_x - box_width / 2.f) / WIDTH; - float ymin = (center_y - box_height / 2.f) / HEIGHT; - float xmax = (center_x + box_width / 2.f) / WIDTH; - float ymax = (center_y + box_height / 2.f) / HEIGHT; - -#if defined(CLIP) - xmin = clamp(xmin, 0.f, 1.f); - ymin = clamp(ymin, 0.f, 1.f); - xmax = clamp(xmax, 0.f, 1.f); - ymax = clamp(ymax, 0.f, 1.f); -#endif // defined(CLIP) - - // Store result - vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(xmin, ymin, xmax, ymax), 0, ((__global DATA_TYPE *)offset(out, idx + 0, 0))); -} - -/** Compute prior boxes (NCHW) - * - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F32 - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] min_size Prior box min size - * @param[in] min_idx Index of the min vector - * @param[in] idx Index to write to - * - * @return The updated index - */ -inline int calculate_min_nchw(Image *out, __global float *max, __global float *aspect_ratios, int max_size, int aspect_ratios_size, float min_size, int min_idx, int idx) -{ - const float center_x = ((float)(get_global_id(0) % LAYER_WIDTH) + OFFSET) * STEP_X; - const float center_y = ((float)(get_global_id(0) / LAYER_WIDTH) + OFFSET) * STEP_Y; - - float box_width = min_size; - float box_height = min_size; - calculate_xy_min_max_nchw(out, idx, center_x, center_y, box_width, box_height); - idx += 4; - - if(max_size > 0) - { - box_width = sqrt(min_size * max[min_idx]); - box_height = box_width; - calculate_xy_min_max_nchw(out, idx, center_x, center_y, box_width, box_height); - idx += 4; - } - for(unsigned int i = 0; i < aspect_ratios_size; ++i) - { - if(fabs(aspect_ratios[i] - 1.f) < 1e-6f) - { - continue; - } - box_width = min_size * sqrt(aspect_ratios[i]); - box_height = min_size * rsqrt(aspect_ratios[i]); - - calculate_xy_min_max_nchw(out, idx, center_x, center_y, box_width, box_height); - idx += 4; - } - - return idx; -} -/** Calculate prior boxes with NCHW format. - * - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: F32
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in]  min                                  The minimum values
- * @param[in]  max                                  The maximum values
- * @param[in]  aspect_ratios                        The aspect ratio values
- * @param[in]  min_size                             The minimum values size
- * @param[in]  max_size                             The maximum values size
- * @param[in]  aspect_ratios_size                   The aspect ratio values size
- */
-__kernel void prior_box_layer_nchw(IMAGE_DECLARATION(output), __global float *min, __global float *max, __global float *aspect_ratios, unsigned int min_size, unsigned int max_size,
-                                   unsigned int aspect_ratios_size)
-{
-    Image out = CONVERT_TO_IMAGE_STRUCT(output);
-
-    int idx = 0;
-    for(unsigned int i = 0; i < min_size; ++i)
-    {
-        idx = calculate_min_nchw(&out, max, aspect_ratios, max_size, aspect_ratios_size, min[i], i, idx);
-    }
-
-    // Store variances
-    for(int i = 0; i < (NUM_PRIORS * 4); i += 4)
-    {
-        vstore4((VEC_DATA_TYPE(DATA_TYPE, 4))(VARIANCE_0, VARIANCE_1, VARIANCE_2, VARIANCE_3), 0, ((__global DATA_TYPE *)offset(&out, i, 1)));
-    }
-}
-#endif /* defined(DATA_TYPE) && defined(WIDTH) && defined(HEIGHT) && defined(LAYER_WIDTH) && defined(LAYER_HEIGHT) && defined(OFFSET) && defined(STEP_X) && defined(STEP_Y) && defined(NUM_PRIORS) && defined(VARIANCE_0) && defined(VARIANCE_1) && defined(VARIANCE_2) && defined(VARIANCE_3) */
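The corner arithmetic performed by calculate_xy_min_max_nchw above, as a self-contained C sketch (WIDTH/HEIGHT and the -DCLIP build option become plain parameters here; names are illustrative, not from the library):

    /* Normalised, optionally clipped corners for one prior box. */
    static void prior_box_corners(float center_x, float center_y,
                                  float box_w, float box_h,
                                  float width, float height, int clip,
                                  float corners[4])
    {
        corners[0] = (center_x - box_w / 2.f) / width;  /* xmin */
        corners[1] = (center_y - box_h / 2.f) / height; /* ymin */
        corners[2] = (center_x + box_w / 2.f) / width;  /* xmax */
        corners[3] = (center_y + box_h / 2.f) / height; /* ymax */
        if(clip)
        {
            for(int i = 0; i < 4; ++i)
            {
                corners[i] = corners[i] < 0.f ? 0.f : (corners[i] > 1.f ? 1.f : corners[i]);
            }
        }
    }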
diff --git a/src/core/CL/cl_kernels/qlstm_layer_normalization.cl b/src/core/CL/cl_kernels/qlstm_layer_normalization.cl
deleted file mode 100644
index 24cb111772..0000000000
--- a/src/core/CL/cl_kernels/qlstm_layer_normalization.cl
+++ /dev/null
@@ -1,260 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers_asymm.h"
-
-#if VEC_SIZE == 2
-#define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 2)
-#define PERFORM_REDUCTION_IMPL(type)                                                    \
-    inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 2) sum) \
-    {                                                                                   \
-        sum.s0 += sum.s1;                                                               \
-        return sum.s0;                                                                  \
-    }
-#elif VEC_SIZE == 4
-#define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 4)
-#define PERFORM_REDUCTION_IMPL(type)                                                    \
-    inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 4) sum) \
-    {                                                                                   \
-        sum.s01 += sum.s23;                                                             \
-        sum.s0 += sum.s1;                                                               \
-        return sum.s0;                                                                  \
-    }
-#elif VEC_SIZE == 8
-#define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 8)
-#define PERFORM_REDUCTION_IMPL(type)                                                    \
-    inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 8) sum) \
-    {                                                                                   \
-        sum.s0123 += sum.s4567;                                                         \
-        sum.s01 += sum.s23;                                                             \
-        sum.s0 += sum.s1;                                                               \
-        return sum.s0;                                                                  \
-    }
-#else /* VEC_SIZE DEFAULT */
-#define VEC_SIZE 16
-#define multiply_by_quantized_multiplier(input, qmul, shift) MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, 16)
-#define PERFORM_REDUCTION_IMPL(type)                                                     \
-    inline VEC_DATA_TYPE(type, 1) perform_reduction_##type(VEC_DATA_TYPE(type, 16) sum) \
-    {                                                                                    \
-        sum.s01234567 += sum.s89abcdef;                                                  \
-        sum.s0123 += sum.s4567;                                                          \
-        sum.s01 += sum.s23;                                                              \
-        sum.s0 += sum.s1;                                                                \
-        return sum.s0;                                                                   \
-    }
-#endif /* VEC_SIZE END */
-
-#define PERFORM_REDUCTION_STR(input, type) perform_reduction_##type(input)
-#define PERFORM_REDUCTION(input, type) PERFORM_REDUCTION_STR(input, type)
-
-PERFORM_REDUCTION_IMPL(int)
-PERFORM_REDUCTION_IMPL(long)
-
-/** Compute quantized multiplier and shift for the inverse square root of input.
- * Using 3-bit fixed point and 5 iterations of the Newton-Raphson method.
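- * (Editor's note: each Newton-Raphson step in the body below refines the
- * estimate via x <- (3/2)*x - (in/2)*x^3, the standard update converging to
- * 1/sqrt(in); fixedpoint_half_three and fixedpoint_half_input hold the 3/2
- * and in/2 factors in fixed point.)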
- * - * @param[in] in Input to use - * @param[in] reverse_shift -1 to reverse the shift direction - * - * @return: - * .s0 Quantized multiplier for inverse square root - * .s1 Shift for inverse square root - * - */ -inline int2 get_invsqrt_quantized_multiplier_exp(int in, int reverse_shift) -{ - int2 stddev_inv; - int stddev_inv_multiplier = INT_MAX; - int stddev_inv_shift = 0; - int input = in; - if(input <= 1) - { - stddev_inv.s0 = stddev_inv_multiplier; - stddev_inv.s1 = stddev_inv_shift; - return stddev_inv; - } - - stddev_inv_shift = 11; - while(input >= (1 << 29)) - { - input /= 4; - ++stddev_inv_shift; - } - - const unsigned int max_left_shift_bits = clz(input) - 1; - const unsigned int max_left_shift_bits_pairs = max_left_shift_bits / 2; - const unsigned int left_shift_bit_pairs = max_left_shift_bits_pairs - 1; - stddev_inv_shift -= left_shift_bit_pairs; - input <<= 2 * left_shift_bit_pairs; - - typedef int FixedPointRawType; - const unsigned int fixedpoint_position = 3; - const unsigned int fixedpoint_int_position = sizeof(FixedPointRawType) * 8 - 1 - fixedpoint_position; - typedef FixedPointRawType FixedPoint3; - typedef FixedPointRawType FixedPoint0; - - const FixedPoint3 fixedpoint_input = (input >> 1); - const FixedPoint3 fixedpoint_half_input = ASYMM_ROUNDING_DIVIDE_BY_POW2(fixedpoint_input, 1, 1); - const FixedPoint3 fixedpoint_half_three = (0x1 << fixedpoint_int_position) + (0x1 << (fixedpoint_int_position - 1)); - FixedPoint3 x = 0x1 << fixedpoint_int_position; - - const int num_iteration = 5; - for(int i = 0; i < num_iteration; i++) - { - int x3 = ASYMM_RESCALE(ASYMM_MULT(ASYMM_MULT(x, x, 1), x, 1), 9, fixedpoint_position, 1); - x = ASYMM_RESCALE(ASYMM_MULT(fixedpoint_half_three, x, 1) - ASYMM_MULT(fixedpoint_half_input, x3, 1), 6, fixedpoint_position, 1); - } - const FixedPoint0 fixedpoint_half_sqrt_2 = 1518500250; - x = ASYMM_MULT(fixedpoint_half_sqrt_2, x, 1); - stddev_inv_multiplier = x; - if(stddev_inv_shift < 0) - { - stddev_inv_multiplier <<= -stddev_inv_shift; - stddev_inv_shift = 0; - } - stddev_inv_shift *= reverse_shift; - - stddev_inv.s0 = stddev_inv_multiplier; - stddev_inv.s1 = stddev_inv_shift; - return stddev_inv; -} - -#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(WIDTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) -/** This function implements QLSTM layer normalization. - * - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @attention Data type should be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Width of the input tensor should be passed using the -DWIDTH compile flag, e.g. -DWIDTH=16 - * - * @param[in] input_ptr Pointer to the first source tensor. Supported data types: QSYMM16 - * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[in] weight_ptr Pointer to the weight tensor. 
Supported data type: same as @p input_ptr - * @param[in] weight_stride_x Stride of the weight tensor in X dimension (in bytes) - * @param[in] weight_step_x weight_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] weight_offset_first_element_in_bytes The offset of the first element in the weight tensor - * @param[in] bias_ptr Pointer to the bias tensor. Supported data type: S32 - * @param[in] bias_stride_x Stride of the bias tensor in X dimension (in bytes) - * @param[in] bias_step_x bias_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] bias_offset_first_element_in_bytes The offset of the first element in the biases tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void qlstm_layer_normalization( - IMAGE_DECLARATION(input), - VECTOR_DECLARATION(weight), - VECTOR_DECLARATION(bias), - IMAGE_DECLARATION(output)) -{ - // Get pixels pointer - Image input = CONVERT_TO_IMAGE_STRUCT(input); - Vector weight = CONVERT_TO_VECTOR_STRUCT(weight); - Vector bias = CONVERT_TO_VECTOR_STRUCT(bias); - Image output = CONVERT_TO_IMAGE_STRUCT(output); - - VEC_DATA_TYPE(int, VEC_SIZE) - sum = 0; - VEC_DATA_TYPE(long, VEC_SIZE) - sum_sq = 0; - // Calculate partial sum - int i = 0; - for(; i <= (WIDTH - VEC_SIZE); i += VEC_SIZE) - { - // Load data - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)offset(&input, i, 0)); - - sum += CONVERT(data, VEC_DATA_TYPE(int, VEC_SIZE)); - sum_sq += CONVERT(data, VEC_DATA_TYPE(long, VEC_SIZE)) * CONVERT(data, VEC_DATA_TYPE(long, VEC_SIZE)); - } - // Perform reduction - sum.s0 = PERFORM_REDUCTION(sum, int); - sum_sq.s0 = PERFORM_REDUCTION(sum_sq, long); - - // Left-overs loop - for(; i < WIDTH; ++i) - { - DATA_TYPE data = *((__global DATA_TYPE *)offset(&input, i, 0)); - - sum.s0 += CONVERT(data, int); - sum_sq.s0 += CONVERT(data, long) * CONVERT(data, long); - } - - int temp = 0x100000 / WIDTH; - int mean = (int)(sum.s0 * 1024 / WIDTH); - int var2 = ((sum_sq.s0 * (long)temp) - ((long)mean * (long)mean)) / 0x100000; - int2 stddev_inv = get_invsqrt_quantized_multiplier_exp(var2, -1); - - i = 0; - for(; i <= (WIDTH - VEC_SIZE); i += VEC_SIZE) - { - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)offset(&input, i, 0)); - VEC_DATA_TYPE(int, VEC_SIZE) - res = CONVERT(data, VEC_DATA_TYPE(int, VEC_SIZE)) * 1024 - mean; - res = multiply_by_quantized_multiplier(res, stddev_inv.s0, stddev_inv.s1); - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - w = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)vector_offset(&weight, i)); - res = res * CONVERT(w, VEC_DATA_TYPE(int, VEC_SIZE)); - res = res + VLOAD(VEC_SIZE)(0, (__global int *)vector_offset(&bias, i)); - // Due to different rounding scheme, we might need to revisit in the future: res = select(res - 512, res + 512, res > 0) / 1024; - res = (res + 512) >> 10; - res = multiply_by_quantized_multiplier(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT + 12); 
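-    // Editor's note: the value reaching this point is carried at a 1024x (Q10)
-    // fixed-point scale (data * 1024 above); (res + 512) >> 10 is the rounding
-    // divide by 1024 discussed in the preceding comment, and the
-    // OUTPUT_MULTIPLIER/OUTPUT_SHIFT pair performs the final requantization.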
-#if defined(MIN_BOUND) - res = max(res, (VEC_DATA_TYPE(int, VEC_SIZE))MIN_BOUND); -#endif // defined(MIN_BOUND) -#if defined(MAX_BOUND) - res = min(res, (VEC_DATA_TYPE(int, VEC_SIZE))MAX_BOUND); -#endif // defined(MAX_BOUND) - VSTORE(VEC_SIZE) - (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)offset(&output, i, 0)); - } - for(; i < WIDTH; ++i) - { - DATA_TYPE data = *((__global DATA_TYPE *)offset(&input, i, 0)); - int res = (int)data * 1024 - mean; - res = MULTIPLY_BY_QUANTIZED_MULTIPLIER(res, stddev_inv.s0, stddev_inv.s1, 1); - DATA_TYPE w = *((__global DATA_TYPE *)vector_offset(&weight, i)); - res = res * (int)w; - int b = *((__global int *)vector_offset(&bias, i)); - res = res + b; - // Due to different rounding scheme, we might need to revisit in the future: res = select(res - 512, res + 512, res > 0) / 1024; - res = (res + 512) >> 10; - res = MULTIPLY_BY_QUANTIZED_MULTIPLIER(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT + 12, 1); -#if defined(MIN_BOUND) - res = max(res, MIN_BOUND); -#endif // defined(MIN_BOUND) -#if defined(MAX_BOUND) - res = min(res, MAX_BOUND); -#endif // defined(MAX_BOUND) - *((__global DATA_TYPE *)offset(&output, i, 0)) = (DATA_TYPE)res; - } -} -#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(WIDTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT) */ \ No newline at end of file diff --git a/src/core/CL/cl_kernels/quantization_layer.cl b/src/core/CL/cl_kernels/quantization_layer.cl deleted file mode 100644 index 3538dae5f0..0000000000 --- a/src/core/CL/cl_kernels/quantization_layer.cl +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) -#define CONVERT_RTE_VEC_STR(x, type, size) (convert_##type##size##_rte((x))) -#define CONVERT_RTE_VEC(x, type, size) CONVERT_RTE_VEC_STR(x, type, size) - -#if defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(SCALE) && defined(OFFSET) && defined(MIN_QUANT_VAL) && defined(MAX_QUANT_VAL) - -/** This performs the quantization of floating point inputs or 8-bit quantized integers to 8-bit integers. - * - * @note Input data type should be given as a preprocessor argument using -DDATA_TYPE_IN=type. e.g. -DDATA_TYPE=short - * @note Output data type should be given as a preprocessor argument using -DDATA_TYPE_OUT=type. e.g. 
-DDATA_TYPE_OUT=short
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Quantization scale should be given as a preprocessor argument using -DSCALE=scale. e.g. -DSCALE=0.125
- * @note Quantization offset should be given as a preprocessor argument using -DOFFSET=offset. e.g. -DOFFSET=125
- * @note Minimum value for quantized type should be given as a preprocessor argument using -DMIN_QUANT_VAL=value. e.g. -DMIN_QUANT_VAL=0
- * @note Maximum value for quantized type should be given as a preprocessor argument using -DMAX_QUANT_VAL=value. e.g. -DMAX_QUANT_VAL=255
- * @note If the input data type is a floating point type (F16 or F32), the preprocessor argument -DIS_FLOAT should be given
- *
- * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/F16/F32
- * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source tensor
- * @param[out] output_ptr                           Pointer to the destination tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in]  output_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void quantization_layer(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output))
-{
-    // Get pixels pointer
-    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
-    // Check if access on width gets out of bounds
-    // If it does shift access vector to access elements within bounds
-    const int xi = (int)(get_global_id(0) * VEC_SIZE);
-    input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
-    output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
-
-    // Load data
-#if defined(IS_FLOAT)
-    // Load data
-    VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
-    val_float = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
-
-    // Create scale and offset vectors
-    const VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) vscale = SCALE;
-    const VEC_DATA_TYPE(int, VEC_SIZE) voffset         = OFFSET;
-#else  // defined(IS_FLOAT)
-    // Load data
-    VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
-    val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
-
-    const VEC_DATA_TYPE(float, VEC_SIZE)
-    val_float = CONVERT(val, VEC_DATA_TYPE(float, VEC_SIZE));
-
-    // Create scale and offset vectors
-    const VEC_DATA_TYPE(float, VEC_SIZE) vscale = SCALE;
-    const VEC_DATA_TYPE(int, VEC_SIZE) voffset  = OFFSET;
-#endif // defined(IS_FLOAT)
-
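-    // Editor's note: the vector path below computes
-    //     clamp(round_rte(val / SCALE) + OFFSET, MIN_QUANT_VAL, MAX_QUANT_VAL),
-    // i.e. the usual affine quantization q = round(x / scale) + zero_point.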
-    // Quantize
-    VEC_DATA_TYPE(int, VEC_SIZE)
-    res = CLAMP(CONVERT_RTE_VEC(val_float / vscale, int, VEC_SIZE) + voffset, MIN_QUANT_VAL, MAX_QUANT_VAL);
-
-    // Store result
-    VSTORE(VEC_SIZE)
-    (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0, (__global DATA_TYPE_OUT *)output.ptr);
-#else  //!defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
-    *((__global DATA_TYPE_OUT *)(output.ptr)) = (DATA_TYPE_OUT)CLAMP(CONVERT_RTE(((float) * (__global DATA_TYPE_IN *)input.ptr) / ((float)SCALE), int) + (int)OFFSET, MIN_QUANT_VAL, MAX_QUANT_VAL);
-#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
-}
-#endif // defined(VEC_SIZE) && defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT) && defined(SCALE) && defined(OFFSET) && defined(MIN_QUANT_VAL) && defined(MAX_QUANT_VAL)
diff --git a/src/core/CL/cl_kernels/range.cl b/src/core/CL/cl_kernels/range.cl
deleted file mode 100644
index d25d10e207..0000000000
--- a/src/core/CL/cl_kernels/range.cl
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(VECTOR_SIZE) && defined(START) && defined(STEP) && defined(DATA_TYPE) && defined(VEC_SIZE_LEFTOVER)
-
-#if !defined(OFFSET_OUT) && !defined(SCALE_OUT)
-
-#if VECTOR_SIZE == 2
-#define STEP_VEC ((VEC_DATA_TYPE(DATA_TYPE, 2))(0, STEP))
-#elif VECTOR_SIZE == 3
-#define STEP_VEC ((VEC_DATA_TYPE(DATA_TYPE, 3))(0, STEP, 2 * STEP))
-#elif VECTOR_SIZE == 4
-#define STEP_VEC ((VEC_DATA_TYPE(DATA_TYPE, 4))(0, STEP, 2 * STEP, 3 * STEP))
-#elif VECTOR_SIZE == 8
-#define STEP_VEC ((VEC_DATA_TYPE(DATA_TYPE, 8))(0, STEP, 2 * STEP, 3 * STEP, 4 * STEP, 5 * STEP, 6 * STEP, 7 * STEP))
-#elif VECTOR_SIZE == 16
-#define STEP_VEC ((VEC_DATA_TYPE(DATA_TYPE, 16))(0, STEP, 2 * STEP, 3 * STEP, 4 * STEP, 5 * STEP, 6 * STEP, 7 * STEP, 8 * STEP, 9 * STEP, 10 * STEP, 11 * STEP, 12 * STEP, 13 * STEP, 14 * STEP, 15 * STEP))
-#endif // VECTOR_SIZE == 2
-
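A scalar C reference for the two kernels below (names are illustrative; the GPU versions split the loop across work-items, VECTOR_SIZE elements each, shifting the final accesses back so no store lands out of bounds):

    #include <math.h>

    /* out[i] = START + i * STEP, as computed by the range kernel. */
    static void range_reference(float *out, int out_len, float start, float step)
    {
        for(int i = 0; i < out_len; ++i)
        {
            out[i] = start + (float)i * step;
        }
    }

    /* Quantized variant: the same sequence mapped to QASYMM8. */
    static void range_quantized_reference(unsigned char *out, int out_len,
                                          float start, float step,
                                          float scale_out, int offset_out)
    {
        for(int i = 0; i < out_len; ++i)
        {
            float v = (start + (float)i * step) / scale_out + (float)offset_out;
            v       = v < 0.f ? 0.f : (v > 255.f ? 255.f : v);
            out[i]  = (unsigned char)rintf(v);
        }
    }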
-/** Generates a sequence of numbers starting from START and extends by increments of 'STEP' up to but not including 'END'.
- *
- * @note starting value of the sequence must be given as a preprocessor argument using -DSTART=value. e.g. -DSTART=0
- * @note difference between consecutive elements of the sequence must be given as a preprocessor argument using -DSTEP=value. e.g. -DSTEP=1
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note vector size supported by the device must be given as a preprocessor argument using -DVECTOR_SIZE=value. e.g. -DVECTOR_SIZE=4
- * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
- *
- * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: U8/S8/U16/S16/U32/S32/F16/F32.
- * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  out_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void range(
-    VECTOR_DECLARATION(out))
-{
-    uint id                 = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VEC_SIZE_LEFTOVER) % VECTOR_SIZE), 0);
-    __global uchar *dst_ptr = out_ptr + out_offset_first_element_in_bytes + id * sizeof(DATA_TYPE);
-#if VECTOR_SIZE == 1
-    DATA_TYPE seq;
-    seq = (DATA_TYPE)START + (DATA_TYPE)id * (DATA_TYPE)STEP;
-
-    *(__global DATA_TYPE *)dst_ptr = seq;
-#else  // VECTOR_SIZE == 1
-    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-    seq0 = ((DATA_TYPE)START + (DATA_TYPE)id * (DATA_TYPE)STEP);
-    seq0 = seq0 + STEP_VEC;
-    STORE_VECTOR_SELECT(seq, DATA_TYPE, dst_ptr, VECTOR_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-#endif //VECTOR_SIZE == 1
-}
-
-#else // !defined(OFFSET_OUT) && !defined(SCALE_OUT)
-
-#if VECTOR_SIZE == 2
-#define STEP_VEC ((VEC_DATA_TYPE(float, 2))(0, STEP))
-#elif VECTOR_SIZE == 3
-#define STEP_VEC ((VEC_DATA_TYPE(float, 3))(0, STEP, 2 * STEP))
-#elif VECTOR_SIZE == 4
-#define STEP_VEC ((VEC_DATA_TYPE(float, 4))(0, STEP, 2 * STEP, 3 * STEP))
-#elif VECTOR_SIZE == 8
-#define STEP_VEC ((VEC_DATA_TYPE(float, 8))(0, STEP, 2 * STEP, 3 * STEP, 4 * STEP, 5 * STEP, 6 * STEP, 7 * STEP))
-#elif VECTOR_SIZE == 16
-#define STEP_VEC ((VEC_DATA_TYPE(float, 16))(0, STEP, 2 * STEP, 3 * STEP, 4 * STEP, 5 * STEP, 6 * STEP, 7 * STEP, 8 * STEP, 9 * STEP, 10 * STEP, 11 * STEP, 12 * STEP, 13 * STEP, 14 * STEP, 15 * STEP))
-#endif // VECTOR_SIZE == 2
-
-#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
-#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
-
-/** Generates a sequence of numbers starting from START and extends by increments of 'STEP' up to but not including 'END'.
- *
- * @note starting value of the sequence must be given as a preprocessor argument using -DSTART=value. e.g. -DSTART=0
- * @note difference between consecutive elements of the sequence must be given as a preprocessor argument using -DSTEP=value. e.g. -DSTEP=1
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note vector size supported by the device must be given as a preprocessor argument using -DVECTOR_SIZE=vector_size. e.g. -DVECTOR_SIZE=4
- * @note The quantization offset of the output must be passed at compile time using -DOFFSET_OUT, i.e. -DOFFSET_OUT=10
- * @note The quantization scale of the output must be passed at compile time using -DSCALE_OUT, i.e. -DSCALE_OUT=10
- *
- * @param[out] out_ptr Pointer to the destination tensor. Supported data types: QASYMM8.
- * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void range_quantized( - VECTOR_DECLARATION(out)) -{ - uint id = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VEC_SIZE_LEFTOVER) % VECTOR_SIZE), 0); - __global uchar *dst_ptr = out_ptr + out_offset_first_element_in_bytes + id * sizeof(DATA_TYPE); -#if VECTOR_SIZE == 1 - float seq; - seq = (float)START + (float)id * (float)STEP; - seq = (DATA_TYPE)(int)(seq / ((float)SCALE_OUT) + (float)OFFSET_OUT); - seq = max(0.0f, min(seq, 255.0f)); - *(__global DATA_TYPE *)dst_ptr = CONVERT_SAT(CONVERT_DOWN(seq, int), DATA_TYPE); -#else // VECTOR_SIZE == 1 - VEC_DATA_TYPE(float, VECTOR_SIZE) - seq = (float)START + id * (float)STEP; - seq = seq + STEP_VEC; - seq = seq / ((VEC_DATA_TYPE(float, VECTOR_SIZE))((float)SCALE_OUT)) + ((VEC_DATA_TYPE(float, VECTOR_SIZE))((float)OFFSET_OUT)); - seq = max((VEC_DATA_TYPE(float, VECTOR_SIZE))(0.0f), min(seq, (VEC_DATA_TYPE(float, VECTOR_SIZE))(255.0f))); - VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) - res0 = CONVERT_SAT(CONVERT_DOWN(seq, VEC_DATA_TYPE(int, VECTOR_SIZE)), VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)); - STORE_VECTOR_SELECT(res, DATA_TYPE, dst_ptr, VECTOR_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) -#endif // VECTOR_SIZE == 1 -} -#endif // !defined(OFFSET_OUT) && !defined(SCALE_OUT) - -#endif // defined(VECTOR_SIZE) && defined(START) && defined(STEP) && defined(DATA_TYPE) && defined(VEC_SIZE_LEFTOVER) diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl deleted file mode 100644 index 9f2c6e23b5..0000000000 --- a/src/core/CL/cl_kernels/reduction_operation.cl +++ /dev/null @@ -1,460 +0,0 @@ -/* - * Copyright (c) 2016-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "helpers.h" -#include "helpers_asymm.h" - -#if defined(FLOAT_DATA_TYPE) -#define ISGREATER(x, y) (SELECT_VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE))(isgreater(x, y)) -#define ISLESS(x, y) (SELECT_VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE))(isless(x, y)) -#define ISGREATER_SCALAR(x, y) (SELECT_DATA_TYPE(DATA_TYPE_PROMOTED))(isgreater(x, y)) -#define ISLESS_SCALAR(x, y) (SELECT_DATA_TYPE(DATA_TYPE_PROMOTED))(isless(x, y)) -#else // !FLOAT_DATA_TYPE -#if defined(WIDTH) -#define ISGREATER(x, y) (x > y) ? 1 : 0 -#define ISLESS(x, y) (x < y) ? 1 : 0 -#define ISGREATER_SCALAR ISGREATER -#define ISLESS_SCALAR ISLESS -#else // !defined(WIDTH) -#define ISGREATER(x, y) select((VEC_DATA_TYPE(int, VEC_SIZE))0, (VEC_DATA_TYPE(int, VEC_SIZE)) - 1, x > y) -#define ISLESS(x, y) select((VEC_DATA_TYPE(int, VEC_SIZE))0, (VEC_DATA_TYPE(int, VEC_SIZE)) - 1, x < y) -#endif // defined(WIDTH) -#endif // defined(FLOAT_DATA_TYPE) - -#if defined(WIDTH) -#if defined(OPERATION) - -#define sum(in0, in1, size) (in0 + SUM_REDUCE(in1, size)) -#define square_sum(in0, in1, size) (in0 + SUM_REDUCE((in1 * in1), size)) -#define product(in0, in1, size) (in0 * PROD_REDUCE(in1, size)) - -/** This kernel performs parallel reduction given an operation on x-axis. - * - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The operation we want to perform must be passed at compile time using -DOPERATION e.g. -DOPERATION=square_sum - * @note The mean flag must be passed at compile time using -DMEAN if we want to compute the mean value - * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used - * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128 if we want to compute the mean value - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input
- * @param[in] output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
- */
-__kernel void reduction_operation_x(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output))
-{
-    int y = get_global_id(1);
-    int z = get_global_id(2);
-
-    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + y * input_stride_y + z * input_stride_z;
-    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + y * output_stride_y + z * output_stride_z;
-
-#if defined(PROD)
-    DATA_TYPE res = (DATA_TYPE)1;
-#else  // defined(PROD)
-    DATA_TYPE res = (DATA_TYPE)0;
-#endif // defined(PROD)
-
-    int x = 0;
-
-    for(; x <= (WIDTH - VEC_SIZE); x += VEC_SIZE)
-    {
-        VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-        vals = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + x * sizeof(DATA_TYPE)));
-        res  = OPERATION(res, vals, VEC_SIZE);
-    }
-
-#if(WIDTH % VEC_SIZE)
-    _Pragma("unroll") for(; x < WIDTH; ++x)
-    {
-        DATA_TYPE val = *((__global DATA_TYPE *)(input_addr + x * sizeof(DATA_TYPE)));
-        res           = OPERATION(res, val, 1);
-    }
-#endif // (WIDTH % VEC_SIZE)
-
-#if defined(MEAN)
-    res /= WIDTH;
-#endif // defined(MEAN)
-    *((__global DATA_TYPE *)output_addr) = res;
}
-#endif // defined(OPERATION)
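Plain-C equivalents of the three OPERATION hooks defined above (sum, square_sum, product); names and signatures are illustrative only:

    /* acc + sum(v), acc + sum(v*v) and acc * prod(v) over an n-wide vector. */
    static float op_sum(float acc, const float *v, int n)
    {
        for(int i = 0; i < n; ++i)
        {
            acc += v[i];
        }
        return acc;
    }

    static float op_square_sum(float acc, const float *v, int n)
    {
        for(int i = 0; i < n; ++i)
        {
            acc += v[i] * v[i];
        }
        return acc;
    }

    static float op_product(float acc, const float *v, int n)
    {
        for(int i = 0; i < n; ++i)
        {
            acc *= v[i];
        }
        return acc;
    }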
-/** This kernel performs reduction on x-axis. (Non parallel)
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
- * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
- *
- * @param[in] input_ptr                             Pointer to the source tensor. Supported data types: S32/F16/F32 and QASYMM8/QASYMM8_SIGNED for operation MEAN
- * @param[in] input_stride_x                        Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[in] output_ptr                            The local buffer to hold summed values. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x                       Stride of the output tensor in X dimension (in bytes)
- * @param[in] output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes  The offset of the first element in the source tensor
- */
-__kernel void reduction_operation_non_parallel_x(
-    VECTOR_DECLARATION(input),
-    VECTOR_DECLARATION(output))
-{
-    Vector input  = CONVERT_TO_VECTOR_STRUCT(input);
-    Vector output = CONVERT_TO_VECTOR_STRUCT(output);
-
-    DATA_TYPE_PROMOTED res = CONVERT(*((__global DATA_TYPE *)vector_offset(&input, 0)), DATA_TYPE_PROMOTED);
-
-    // Convert input into F32 in order to perform quantized multiplication
-#if defined(PROD) && defined(OFFSET) && defined(SCALE)
-    float res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
-#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
-
-    for(unsigned int x = 1; x < WIDTH; ++x)
-    {
-        DATA_TYPE_PROMOTED in = CONVERT(*((__global DATA_TYPE *)vector_offset(&input, x)), DATA_TYPE_PROMOTED);
-#if defined(MIN)
-        res = select(res, in, ISLESS_SCALAR(in, res));
-#elif defined(MAX)
-        res = select(res, in, ISGREATER_SCALAR(in, res));
-#elif defined(PROD)
-#if defined(OFFSET) && defined(SCALE)
-        res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
-#else  // !(defined(OFFSET) && defined(SCALE))
-        res *= in;
-#endif // defined(OFFSET) && defined(SCALE)
-#else  // defined(SUM)
-        res += in;
-#endif // defined(MAX) || defined(MIN) || defined(PROD)
-    }
-
-    // Store result
-#if defined(MEAN)
-    res /= WIDTH;
-#endif // defined(MEAN)
-
-    // Subtract the offsets in case of quantized SUM
-#if defined(SUM) && defined(OFFSET) && defined(SCALE)
-    res -= (WIDTH - 1) * OFFSET;
-#endif // defined(SUM) && defined(OFFSET) && defined(SCALE)
-
-    // Re-quantize
-#if defined(PROD) && defined(OFFSET) && defined(SCALE)
-    res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
-#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
-
-    *((__global DATA_TYPE *)output.ptr) = CONVERT_SAT(res, DATA_TYPE);
-}
-#endif // defined(WIDTH)
-
-#if defined(HEIGHT)
-/** This kernel performs reduction on y-axis.
- *
- * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
- *
- * @param[in] input_ptr                             Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
- * @param[in] input_stride_x                        Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[in] output_ptr                            The local buffer to hold summed values.
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor - */ -__kernel void reduction_operation_y( - IMAGE_DECLARATION(input), - IMAGE_DECLARATION(output)) -{ - int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); - int y = get_global_id(1); - - __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y; - __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y; - - VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE) - res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)); - - // Convert input into F32 in order to perform quantized multiplication -#if defined(PROD) && defined(OFFSET) && defined(SCALE) - VEC_DATA_TYPE(float, VEC_SIZE) - res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE); -#endif // defined(PROD) && defined(OFFSET) && defined(SCALE) - -#if defined(SUM_SQUARE) - res *= res; -#endif // defined(SUM_SQUARE) - - for(unsigned int y = 1; y < HEIGHT; ++y) - { - VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE) - in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + y * input_stride_y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)); -#if defined(MIN) - res = select(res, in, ISLESS(in, res)); -#elif defined(MAX) - res = select(res, in, ISGREATER(in, res)); -#else // !(defined(MAX) || defined(MIN)) -#if defined(SUM_SQUARE) - in *= in; -#endif // defined(SUM_SQUARE) -#if defined(PROD) - -#if defined(OFFSET) && defined(SCALE) - res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE); -#else // !(defined(OFFSET) && defined(SCALE)) - res *= in; -#endif // defined(OFFSET) && defined(SCALE) - -#else // !defined(PROD) - res += in; -#endif // defined(PROD) -#endif // defined(MAX) || defined(MIN) - } - -#if defined(MEAN) - res /= HEIGHT; -#endif // defined(MEAN) - - // Subtract the offsets in case of quantized SUM -#if defined(SUM) && defined(OFFSET) && defined(SCALE) - res -= (HEIGHT - 1) * OFFSET; -#endif // defined(OFFSET) && defined(OFFSET) && defined(SCALE) - - // Re-quantize -#if defined(PROD) && defined(OFFSET) && defined(SCALE) - res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE); -#endif // defined(PROD) && defined(OFFSET) && defined(SCALE) - - // Store result - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - res0 = CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); - STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); -} -#endif // defined(HEIGHT) - -#if defined(DEPTH) -/** This kernel performs reduction on z-axis. - * - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128 - * - * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
- * @param[in] input_stride_x                        Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x                          input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y                          input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z                          input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes   The offset of the first element in the source tensor
- * @param[in] output_ptr                            The local buffer to hold summed values. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x                       Stride of the output tensor in X dimension (in bytes)
- * @param[in] output_step_x                         output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y                       Stride of the output tensor in Y dimension (in bytes)
- * @param[in] output_step_y                         output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z                       Stride of the output tensor in Z dimension (in bytes)
- * @param[in] output_step_z                         output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes  The offset of the first element in the source tensor
- */
-__kernel void reduction_operation_z(
-    TENSOR3D_DECLARATION(input),
-    TENSOR3D_DECLARATION(output))
-{
-    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
-    int y = get_global_id(1);
-    int z = get_global_id(2);
-
-    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + z * input_stride_z;
-    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + z * output_stride_z;
-
-    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
-    res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
-
-    // Convert input into F32 in order to perform quantized multiplication
-#if defined(PROD) && defined(OFFSET) && defined(SCALE)
-    VEC_DATA_TYPE(float, VEC_SIZE)
-    res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
-#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
-
-#if defined(SUM_SQUARE)
-    res *= res;
-#endif // defined(SUM_SQUARE)
-
-    for(unsigned int z = 1; z < DEPTH; ++z)
-    {
-        VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
-        in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + z * input_stride_z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
-
-#if defined(MIN)
-        res = select(res, in, ISLESS(in, res));
-#elif defined(MAX)
-        res = select(res, in, ISGREATER(in, res));
-#else  // !(defined(MAX) || defined(MIN))
-#if defined(SUM_SQUARE)
-        in *= in;
-#endif // defined(SUM_SQUARE)
-#if defined(PROD)
-
-#if defined(OFFSET) && defined(SCALE)
-        res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
-#else  // !(defined(OFFSET) && defined(SCALE))
-        res *= in;
-#endif // defined(OFFSET) && defined(SCALE)
-
-#else  // !defined(PROD)
-        res += in;
-#endif // defined(PROD)
-#endif // defined(MAX) || defined(MIN)
-    }
-
-#if defined(MEAN)
-    res /= DEPTH;
-#endif // defined(MEAN)
-
-    // Subtract the offsets in case of quantized SUM
-#if defined(SUM) && defined(OFFSET) && defined(SCALE)
-    res -= (DEPTH - 1) * OFFSET;
-#endif // defined(SUM) && defined(OFFSET) && defined(SCALE)
-
-    // Re-quantize
-#if defined(PROD) && defined(OFFSET) && defined(SCALE)
-    res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
-#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
-
-    // Store result
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    res0 = CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
-
-    STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
-}
-#endif /* defined(DEPTH) */
-
-#if defined(BATCH) && defined(DEPTH)
-/** This kernel performs reduction on w-axis.
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
- * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[in] output_ptr The local buffer to hold summed values.
Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the output tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void reduction_operation_w(
-    TENSOR4D_DECLARATION(input),
-    TENSOR4D_DECLARATION(output))
-{
-    int x = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
-    int y = get_global_id(1);
-    int z = get_global_id(2);
-
-    __global uchar *input_addr  = input_ptr + input_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * input_stride_y + (z % DEPTH) * input_stride_z + (z / DEPTH) * input_stride_w;
-    __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * output_stride_y + (z % DEPTH) * output_stride_z + (z / DEPTH) * output_stride_w;
-
-    VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
-    res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
-
-    // Convert input into F32 in order to perform quantized multiplication
-#if defined(PROD) && defined(OFFSET) && defined(SCALE)
-    VEC_DATA_TYPE(float, VEC_SIZE)
-    res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
-#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
-
-#if defined(SUM_SQUARE)
-    res *= res;
-#endif // defined(SUM_SQUARE)
-
-    for(unsigned int w = 1; w < BATCH; ++w)
-    {
-        VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE)
-        in = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + w * input_stride_w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, VEC_SIZE));
-
-#if defined(MIN)
-        res = select(res, in, ISLESS(in, res));
-#elif defined(MAX)
-        res = select(res, in, ISGREATER(in, res));
-#else  // !(defined(MAX) || defined(MIN))
-#if defined(SUM_SQUARE)
-        in *= in;
-#endif // defined(SUM_SQUARE)
-#if defined(PROD)
-
-#if defined(OFFSET) && defined(SCALE)
-        res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
-#else  // !(defined(OFFSET) && defined(SCALE))
-        res *= in;
-#endif // defined(OFFSET) && defined(SCALE)
-
-#else  // !defined(PROD)
-        res += in;
-#endif // defined(PROD)
-#endif // defined(MAX) || defined(MIN)
-    }
-
-#if defined(MEAN)
-    res /= BATCH;
-#endif // defined(MEAN)
-
-    // Subtract the offsets in case of quantized SUM
-#if defined(SUM) && defined(OFFSET) && defined(SCALE)
-    res -= (BATCH - 1) * OFFSET;
-#endif // defined(SUM) && defined(OFFSET) && defined(SCALE)
-
-    // Re-quantize
-#if defined(PROD) && defined(OFFSET) && defined(SCALE)
-    res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, VEC_SIZE);
-#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
-
-    // Store result
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    res0 = CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
-    STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
-}
-#endif /* defined(BATCH) && defined(DEPTH) */
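The reduction kernels above select their operator (MIN/MAX/PROD/SUM/SUM_SQUARE/MEAN) purely through preprocessor definitions, so exactly one accumulation branch survives compilation. A minimal host-side sketch of how such a specialization would be built; the cl_program and cl_device_id handles and the exact option set are illustrative assumptions, not part of this patch:

    /* Sketch: build a mean-over-y reduction for a 128-row F32 tensor.
     * `program` and `device` are assumed to already exist. */
    const char *options = "-DDATA_TYPE=float -DDATA_TYPE_PROMOTED=float "
                          "-DVEC_SIZE=16 -DVEC_SIZE_LEFTOVER=0 "
                          "-DHEIGHT=128 -DMEAN";
    cl_int err = clBuildProgram(program, 1, &device, options, NULL, NULL);
    /* With -DMEAN the kernel takes the `res += in` branch and divides by HEIGHT. */
    cl_kernel reduce_y = clCreateKernel(program, "reduction_operation_y", &err);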
diff --git a/src/core/CL/cl_kernels/remap.cl b/src/core/CL/cl_kernels/remap.cl
deleted file mode 100644
index cb67c2df1e..0000000000
--- a/src/core/CL/cl_kernels/remap.cl
+++ /dev/null
@@ -1,286 +0,0 @@
-/*
- * Copyright (c) 2017, 2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "warp_helpers.h"
-
-#ifndef DEPTH_OUT
-/** Performs a remapping of an input image to an output image, given two remapping images, using nearest neighbour interpolation.
- *
- * This kernel performs remapping with this method of pixel coordinate translation:
- *     out(x,y) = in(mapx(x,y), mapy(x,y));
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
- * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32.
- * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapx_step_y mapx_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] mapy_ptr Pointer to the y remapping image. Supported data types: F32.
- * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] width Width of the input image
- * @param[in] height Height of the input image
- */
-__kernel void remap_nearest_neighbour_nchw(
-    IMAGE_DECLARATION(in),
-    IMAGE_DECLARATION(out),
-    IMAGE_DECLARATION(mapx),
-    IMAGE_DECLARATION(mapy),
-    const float width,
-    const float height)
-{
-    Image in   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
-    Image out  = CONVERT_TO_IMAGE_STRUCT(out);
-    Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx);
-    Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy);
-
-    float4 mapx_coords = vload4(0, (__global float *)mapx.ptr);
-    float4 mapy_coords = vload4(0, (__global float *)mapy.ptr);
-    float8 map_coords  = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1,
-                                  mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3);
-
-    vstore4(read_texels4(&in, convert_int8(clamp_to_border(map_coords, width, height))), 0, out.ptr);
-}
-
-/** Performs a remapping of an input image to an output image, given two remapping images, using bilinear interpolation.
- *
- * This kernel performs remapping with this method of pixel coordinate translation:
- *     out(x,y) = in(mapx(x,y), mapy(x,y));
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
- * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32.
- * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapx_step_y mapx_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] mapy_ptr Pointer to the y remapping image. Supported data types: F32.
- * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] width Width of the input image
- * @param[in] height Height of the input image
- */
-__kernel void remap_bilinear_nchw(
-    IMAGE_DECLARATION(in),
-    IMAGE_DECLARATION(out),
-    IMAGE_DECLARATION(mapx),
-    IMAGE_DECLARATION(mapy),
-    const float width,
-    const float height)
-{
-    Image in   = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
-    Image out  = CONVERT_TO_IMAGE_STRUCT(out);
-    Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx);
-    Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy);
-
-    float4 mapx_coords = vload4(0, (__global float *)mapx.ptr);
-    float4 mapy_coords = vload4(0, (__global float *)mapy.ptr);
-    float8 map_coords  = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1,
-                                  mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3);
-
-    vstore4(bilinear_interpolate(&in, clamp_to_border(map_coords, width, height), width, height), 0, out.ptr);
-}
-#else // DEPTH_OUT
-/** Performs a remapping of an input image to an output image, given two remapping images, using nearest neighbour interpolation.
- * Also applies a constant border value, "border_val", if "CONSTANT_BORDER" is set.
- *
- * This kernel performs remapping with this method of pixel coordinate translation:
- *     out(x,y) = in(mapx(x,y), mapy(x,y));
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8,F16.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8,F16.
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
- * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32.
- * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapx_step_y mapx_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] mapy_ptr Pointer to the y remapping image. Supported data types: F32.
- * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] width Width of the input image
- * @param[in] height Height of the input image
- * @param[in] border_val Value to use for the border around the input tensor when the CONSTANT border mode is selected
- */
-__kernel void remap_nearest_neighbour_nhwc(
-    TENSOR4D_DECLARATION(in),
-    TENSOR4D_DECLARATION(out),
-    TENSOR4D_DECLARATION(mapx),
-    TENSOR4D_DECLARATION(mapy),
-    const float width,
-    const float height
-#ifdef CONSTANT_BORDER
-    ,
-    const DATA_TYPE border_val
-#endif // CONSTANT_BORDER
-)
-{
-    Tensor4D in   = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
-    Tensor4D out  = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
-    Tensor4D mapx = CONVERT_TO_TENSOR4D_STRUCT(mapx, DEPTH_OUT);
-    Tensor4D mapy = CONVERT_TO_TENSOR4D_STRUCT(mapy, DEPTH_OUT);
-
-    float mapx_coord = (float) * (__global float *)mapx.ptr;
-    float mapy_coord = (float) * (__global float *)mapy.ptr;
-
-#ifdef CONSTANT_BORDER
-    if(mapx_coord < 0 || mapx_coord > width - 1 || mapy_coord < 0 || mapy_coord > height - 1)
-    {
-        *((__global DATA_TYPE *)out.ptr) = border_val;
-        return;
-    }
-#else  // CONSTANT_BORDER
-    mapx_coord = clamp(mapx_coord, 0.0f, width - 1);
-    mapy_coord = clamp(mapy_coord, 0.0f, height - 1);
-#endif // CONSTANT_BORDER
-    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(mapx_coord), convert_int(mapy_coord), (get_global_id(2) / DEPTH_OUT)));
-}
-
-/** Performs a remapping of an input image to an output image, given two remapping images, using bilinear interpolation.
- * Also applies a constant border value, "border_val", if "CONSTANT_BORDER" is set.
- *
- * This kernel performs remapping with this method of pixel coordinate translation:
- *     out(x,y) = in(mapx(x,y), mapy(x,y));
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8,F16.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8,F16.
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
- * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32.
- * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapx_step_y mapx_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] mapy_ptr Pointer to the y remapping image. Supported data types: F32.
- * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] width Width of the input image
- * @param[in] height Height of the input image
- * @param[in] border_val Value to use for the border around the input tensor when the CONSTANT border mode is selected
- */
-__kernel void remap_bilinear_nhwc(
-    TENSOR4D_DECLARATION(in),
-    TENSOR4D_DECLARATION(out),
-    TENSOR4D_DECLARATION(mapx),
-    TENSOR4D_DECLARATION(mapy),
-    const float width,
-    const float height
-#ifdef CONSTANT_BORDER
-    ,
-    const DATA_TYPE border_val
-#endif // CONSTANT_BORDER
-)
-{
-    Tensor4D in   = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
-    Tensor4D out  = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
-    Tensor4D mapx = CONVERT_TO_TENSOR4D_STRUCT(mapx, DEPTH_OUT);
-    Tensor4D mapy = CONVERT_TO_TENSOR4D_STRUCT(mapy, DEPTH_OUT);
-
-    float mapx_coord = (float) * (__global float *)mapx.ptr;
-    float mapy_coord = (float) * (__global float *)mapy.ptr;
-
-#ifdef CONSTANT_BORDER
-    if(mapx_coord < 0 || mapx_coord > width - 1 || mapy_coord < 0 || mapy_coord > height - 1)
-    {
-        *((__global DATA_TYPE *)out.ptr) = border_val;
-        return;
-    }
-#endif // CONSTANT_BORDER
-
-    const float new_xf      = floor(mapx_coord);
-    const float new_yf      = floor(mapy_coord);
-    const float clamped_x   = clamp(new_xf, 0.0f, width - 1);
-    const float clamped_x1  = clamp(new_xf + 1, 0.0f, width - 1);
-    const float clamped_y   = clamp(new_yf, 0.0f, height - 1);
-    const float clamped_y1  = clamp(new_yf + 1, 0.0f, height - 1);
-
-    float4 ins = (float4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
-                          *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
-                          *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))),
-                          *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))));
-
-    const float a  = mapx_coord - new_xf;
-    const float b  = 1.f - a;
-    const float a1 = mapy_coord - new_yf;
-    const float b1 = 1.f - a1;
-    const float fr = ((ins.s0 * b * b1) + (ins.s1 * a * b1) + (ins.s2 * b * a1) + (ins.s3 * a * a1));
-
-    *((__global DATA_TYPE *)out.ptr) = CONVERT(fr, DATA_TYPE);
-}
-
-#endif // DEPTH_OUT
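The bilinear path of the removed remap kernels weights the four clamped neighbours by the fractional distances a/b (horizontal) and a1/b1 (vertical), exactly as in the `fr` expression above. The same arithmetic as a stand-alone C function, a minimal sketch for checking results on the host (the function name is ours):

    #include <math.h>
    /* tl/tr/bl/br are the texels at (x0,y0), (x0+1,y0), (x0,y0+1), (x0+1,y0+1). */
    static float remap_bilinear_weights(float tl, float tr, float bl, float br,
                                        float x, float y)
    {
        const float a  = x - floorf(x), b  = 1.f - a;  /* horizontal weights */
        const float a1 = y - floorf(y), b1 = 1.f - a1; /* vertical weights   */
        return (tl * b * b1) + (tr * a * b1) + (bl * b * a1) + (br * a * a1);
    }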
diff --git a/src/core/CL/cl_kernels/reorg_layer.cl b/src/core/CL/cl_kernels/reorg_layer.cl
deleted file mode 100644
index 29344de37a..0000000000
--- a/src/core/CL/cl_kernels/reorg_layer.cl
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE)
-
-#define CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi) \
-    ({                                                    \
-        int offset = zo / (int)SRC_DEPTH;                 \
-        xi = xo * (int)STRIDE + offset % (int)STRIDE;     \
-        yi = yo * (int)STRIDE + offset / (int)STRIDE;     \
-        zi = zo % SRC_DEPTH;                              \
-    })
-
-/** Performs a reorganization layer from the input tensor to the output tensor when the data layout is NCHW
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64
- * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void reorg_layer_nchw(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
-{
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
-    int xo = get_global_id(0);
-    int yo = get_global_id(1);
-    int zo = get_global_id(2);
-    int xi, yi, zi;
-
-    CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi);
-
-    int src_offset = xi * sizeof(DATA_TYPE) + yi * src_stride_y + zi * src_stride_z;
-    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset));
-}
-
-/** Performs a reorganization layer from the input tensor to the output tensor when the data layout is NHWC
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64
- * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor.
Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void reorg_layer_nhwc(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst))
-{
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
-    int xo = get_global_id(1);
-    int yo = get_global_id(2);
-    int zo = get_global_id(0);
-    int xi, yi, zi;
-
-    CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi);
-
-    int src_offset = zi * sizeof(DATA_TYPE) + xi * src_stride_y + yi * src_stride_z;
-
-    *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset));
-}
-#endif // defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE)
\ No newline at end of file
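CALCULATE_SRC_COORDINATES above undoes the depth packing of the reorg layer: every group of SRC_DEPTH output channels selects one (x, y) offset inside a STRIDE x STRIDE cell of the input. A host-side restatement of the same index math for sanity checking (hypothetical helper, identical formulas):

    /* For STRIDE=2 and SRC_DEPTH=64: zo=130 gives offset=2, so zi=2,
     * xi=2*xo+0, yi=2*yo+1, i.e. cell offset (0,1). */
    static void reorg_src_coords(int xo, int yo, int zo, int src_depth, int stride,
                                 int *xi, int *yi, int *zi)
    {
        const int offset = zo / src_depth; /* index of the cell inside the stride block */
        *xi = xo * stride + offset % stride;
        *yi = yo * stride + offset / stride;
        *zi = zo % src_depth;
    }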
diff --git a/src/core/CL/cl_kernels/reshape_layer.cl b/src/core/CL/cl_kernels/reshape_layer.cl
deleted file mode 100644
index 2d6a7edade..0000000000
--- a/src/core/CL/cl_kernels/reshape_layer.cl
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/** Perform tensor reshape
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- *
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: All
- * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] input_shape Input spatial shape
- * @param[in] output_shape Output spatial shape
- */
-__kernel void reshape_layer(TENSOR3D_DECLARATION(input),
-                            TENSOR3D_DECLARATION(output),
-                            int2 input_shape,
-                            int2 output_shape)
-{
-    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(input);
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
-
-    int3 id = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
-
-    // Linearize index
-    int linear_idx = id.x + id.y * input_shape.x + id.z * input_shape.x * input_shape.y;
-
-    // Translate to output
-    int3 out_id;
-    out_id.x = linear_idx % output_shape.x;
-    out_id.y = (linear_idx / output_shape.x) % output_shape.y;
-    out_id.z = linear_idx / (output_shape.x * output_shape.y);
-
-    // Store result
-    *((__global DATA_TYPE *)tensor3D_offset(&out, out_id.x, out_id.y, out_id.z)) = *((__global DATA_TYPE *)in.ptr);
-}
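reshape_layer is a pure index translation: linearize the source coordinate in x-fastest order, then peel the linear index apart against the destination shape. The same mapping on the host (a sketch; the function name is ours):

    static void reshape_map(int x, int y, int z,
                            const int in_shape[2], const int out_shape[2],
                            int out_id[3])
    {
        /* Linearize against the input shape (x runs fastest)... */
        const int linear = x + y * in_shape[0] + z * in_shape[0] * in_shape[1];
        /* ...then delinearize against the output shape. */
        out_id[0] = linear % out_shape[0];
        out_id[1] = (linear / out_shape[0]) % out_shape[1];
        out_id[2] = linear / (out_shape[0] * out_shape[1]);
    }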
diff --git a/src/core/CL/cl_kernels/reverse.cl b/src/core/CL/cl_kernels/reverse.cl
deleted file mode 100644
index 10ffe84aeb..0000000000
--- a/src/core/CL/cl_kernels/reverse.cl
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(NUM_REVERSE_DIMS)
-
-#if NUM_REVERSE_DIMS > 4
-#error("Reversing more than 4 dimensions is not currently supported")
-#endif /* NUM_REVERSE_DIMS > 4 */
-
-/** Performs reverse along the specified axis.
- *
- * @note The data type must be given as a preprocessor argument using -DDATA_TYPE=num. e.g. -DDATA_TYPE=uint
- * @note The number of dimensions to reverse must be given as a preprocessor argument using -DNUM_REVERSE_DIMS=num, e.g. -DNUM_REVERSE_DIMS=3
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the first source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[in] axis_ptr Pointer to the source vector. Supported data types: U32
- * @param[in] axis_stride_x Stride of the axis vector in X dimension (in bytes)
- * @param[in] axis_step_x axis_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] axis_offset_first_element_in_bytes The offset of the first element in the axis vector
- * @param[out] dst_ptr Pointer to the destination tensor.
Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void reverse(TENSOR4D_DECLARATION(src),
-                      VECTOR_DECLARATION(axis),
-                      TENSOR4D_DECLARATION(dst),
-                      const uint width,
-                      const uint height,
-                      const uint depth,
-                      const uint batches)
-{
-    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, depth);
-    Vector axis  = CONVERT_TO_VECTOR_STRUCT_NO_STEP(axis);
-    Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(dst, depth);
-
-    const uint x_in = get_global_id(0);
-    const uint y_in = get_global_id(1);
-    const uint z_in = get_global_id(2) % depth;
-    const uint w_in = get_global_id(2) / depth;
-
-    const uint4 dims = (uint4)(0, 1, 2, 3);
-    int4 to_reverse  = (int4)(0, 0, 0, 0);
-#if NUM_REVERSE_DIMS == 1
-    const uint index = *((__global uint *)axis.ptr);
-    to_reverse       = (uint4)index == dims;
-#elif NUM_REVERSE_DIMS == 2
-    const uint2 indices = vload2(0, (__global uint *)axis.ptr);
-    to_reverse          = ((uint4)indices.s0 == dims) || ((uint4)indices.s1 == dims);
-#elif NUM_REVERSE_DIMS == 3
-    const uint2 indices01 = vload2(0, (__global uint *)axis.ptr);
-    const uint index2     = *((__global uint *)axis.ptr + 2);
-    to_reverse            = ((uint4)indices01.s0 == dims) || ((uint4)indices01.s1 == dims) || ((uint4)index2 == dims);
-#else  /* NUM_REVERSE_DIMS == 3 */
-    const uint4 indices = vload4(0, (__global uint *)axis.ptr);
-    to_reverse          = ((uint4)indices.s0 == dims) || ((uint4)indices.s1 == dims) || ((uint4)indices.s2 == dims) || ((uint4)indices.s3 == dims);
-#endif /* NUM_REVERSE_DIMS == 1 */
-    const uint x_out = to_reverse.s0 ? width - x_in - 1 : x_in;
-    const uint y_out = to_reverse.s1 ? height - y_in - 1 : y_in;
-    const uint z_out = to_reverse.s2 ? depth - z_in - 1 : z_in;
-    const uint w_out = to_reverse.s3 ? batches - w_in - 1 : w_in;
-
-    *((__global DATA_TYPE *)tensor4D_offset(&dst, x_out, y_out, z_out, w_out)) = *((__global DATA_TYPE *)src.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(NUM_REVERSE_DIMS)
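The reverse kernel builds a per-axis flip mask by comparing each requested axis id against (0,1,2,3) and then mirrors only the flagged coordinates with i -> dim - 1 - i. Equivalent scalar logic in C (a sketch; names are ours):

    static void reverse_coords(const unsigned *axes, int num_axes,
                               const unsigned in[4], const unsigned dims[4],
                               unsigned out[4])
    {
        int flip[4] = { 0, 0, 0, 0 };
        for(int a = 0; a < num_axes; ++a)
        {
            flip[axes[a]] = 1; /* axes[] holds axis ids in [0,3] */
        }
        for(int d = 0; d < 4; ++d)
        {
            out[d] = flip[d] ? dims[d] - 1u - in[d] : in[d]; /* mirror flagged axes */
        }
    }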
diff --git a/src/core/CL/cl_kernels/roi_align_layer.cl b/src/core/CL/cl_kernels/roi_align_layer.cl
deleted file mode 100644
index e0b98e68c9..0000000000
--- a/src/core/CL/cl_kernels/roi_align_layer.cl
+++ /dev/null
@@ -1,200 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-// This specifies the value to shift the result of roi_dims / pooled_dims before ceiling.
-// It is close to the epsilon machine (for a floating point system, x and x+EPS are the same number).
-#define EPS_GRID 0.00001f
-
-#if defined(DATA_TYPE) && defined(POOLED_DIM_X) && defined(POOLED_DIM_Y) && defined(MAX_DIM_X) && defined(MAX_DIM_Y) && defined(MAX_DIM_Z) && defined(SPATIAL_SCALE) // Check for compile time constants
-
-/** Performs a roi align on a single output pixel.
- *
- * @param[in] input Pointer to input Tensor3D struct.
- * @param[in] region_start_x Start x index projected onto the input tensor.
- * @param[in] bin_size_x Size of the pooling bin along x.
- * @param[in] grid_size_x Number of sampling points per bin along x.
- * @param[in] region_end_x End x index projected onto the input tensor.
- * @param[in] region_start_y Start y index projected onto the input tensor.
- * @param[in] bin_size_y Size of the pooling bin along y.
- * @param[in] grid_size_y Number of sampling points per bin along y.
- * @param[in] region_end_y End y index projected onto the input tensor.
- * @param[in] pz z index of the input tensor.
- *
- * @return An average pooled value from the region specified in the input tensor.
- */
-inline DATA_TYPE roi_align_1x1(const Tensor3D *input, float region_start_x,
-                               float bin_size_x,
-                               float grid_size_x,
-                               float region_end_x,
-                               float region_start_y,
-                               float bin_size_y,
-                               float grid_size_y,
-                               float region_end_y,
-                               int pz)
-{
-    // Iterate through the pooling region
-    float sum = 0;
-    for(int iy = 0; iy < grid_size_y; ++iy)
-    {
-        for(int ix = 0; ix < grid_size_x; ++ix)
-        {
-            // Align the window in the middle of every bin
-            const float y = region_start_y + (iy + 0.5f) * bin_size_y / (float)grid_size_y;
-            const float x = region_start_x + (ix + 0.5f) * bin_size_x / (float)grid_size_x;
-
-            // Interpolation in the unit square
-            const int y_low  = (int)y;
-            const int x_low  = (int)x;
-            const int y_high = y_low + 1;
-            const int x_high = x_low + 1;
-
-            const float ly = y - y_low;
-            const float lx = x - x_low;
-            const float hy = 1.f - ly;
-            const float hx = 1.f - lx;
-
-            const float w1 = hy * hx;
-            const float w2 = hy * lx;
-            const float w3 = ly * hx;
-            const float w4 = ly * lx;
-#if defined(NHWC)
-            const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_low);
-            const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_low);
-            const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_high);
-            const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_high);
-#else  // !defined(NHWC)
-            const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_low, pz);
-            const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_low, pz);
-            const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_high, pz);
-            const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_high, pz);
-#endif // defined(NHWC)
-            sum += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
-        }
-    }
-
-    return (DATA_TYPE)(sum / (grid_size_x * grid_size_y));
-}
-
-/** Performs a roi align function.
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32;
- * @note Datasize must be passed using -DDATA_SIZE e.g. -DDATA_SIZE=32;
- * @note Input dimensions must be passed using -DMAX_DIM_X, -DMAX_DIM_Y and -DMAX_DIM_Z;
- * @note Pooled region dimensions must be passed using -DPOOLED_DIM_X and -DPOOLED_DIM_Y;
- * @note Spatial scale must be passed using -DSPATIAL_SCALE;
- * @note Sampling ratio (i.e., the number of samples in each bin) may be passed using -DSAMPLING_RATIO. If not defined each roi
- *       will have a default sampling ratio of roi_dims/pooling_dims
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16, F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the pooled region of the source tensor as specified by ROI
- * @param[in] rois_ptr Pointer to the ROIs tensor. Layout: { batch_index, x1, y1, x2, y2 }.
Supported data types: same as @p input_ptr
- * @param[in] rois_stride_x Stride of the ROIs tensor in X dimension (in bytes)
- * @param[in] rois_step_x Step of the ROIs tensor in X dimension (in bytes)
- * @param[in] rois_stride_y Stride of the ROIs tensor in Y dimension (in bytes)
- * @param[in] rois_step_y Step of the ROIs tensor in Y dimension (in bytes)
- * @param[in] rois_offset_first_element_in_bytes The offset of the first element in the ROIs tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void roi_align_layer(
-    TENSOR3D_DECLARATION(input),
-    IMAGE_DECLARATION(rois),
-    TENSOR3D_DECLARATION(output),
-    unsigned int input_stride_w, unsigned int output_stride_w)
-{
-    // Get pixels pointer
-    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
-    Image rois      = CONVERT_TO_IMAGE_STRUCT_NO_STEP(rois);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
-
-#if defined(NHWC)
-    const int px = get_global_id(1);
-    const int py = get_global_id(2);
-    const int pw = get_global_id(0);
-#else  // !defined(NHWC)
-    const int px = get_global_id(0);
-    const int py = get_global_id(1);
-    const int pw = get_global_id(2);
-#endif // defined(NHWC)
-
-    // Load roi parameters
-    // roi is laid out as follows { batch_index, x1, y1, x2, y2 }
-    const ushort roi_batch = (ushort) * ((__global DATA_TYPE *)offset(&rois, 0, pw));
-    const VEC_DATA_TYPE(DATA_TYPE, 4)
-    roi                     = vload4(0, (__global DATA_TYPE *)offset(&rois, 1, pw));
-    const float2 roi_anchor = convert_float2(roi.s01) * convert_float(SPATIAL_SCALE);
-    const float2 roi_dims   = fmax(convert_float2(roi.s23 - roi.s01) * convert_float(SPATIAL_SCALE), 1.f);
-
-    // Calculate pooled region start and end
-    const float2 spatial_indx     = (float2)(px, py);
-    const float2 pooled_dims      = (float2)(POOLED_DIM_X, POOLED_DIM_Y);
-    const float2 max_spatial_dims = (float2)(MAX_DIM_X, MAX_DIM_Y);
-
-    const float2 bin_size = (float2)((roi_dims.s0 / (float)POOLED_DIM_X), (roi_dims.s1 / (float)POOLED_DIM_Y));
-    float2 region_start   = spatial_indx * bin_size + roi_anchor;
-    float2 region_end     = (spatial_indx + 1) * bin_size + roi_anchor;
-
-    region_start = clamp(region_start, 0, max_spatial_dims);
-    region_end   = clamp(region_end, 0, max_spatial_dims);
-
-#if defined(SAMPLING_RATIO)
-    const float2 roi_bin_grid = SAMPLING_RATIO;
-#else  // !defined(SAMPLING_RATIO)
-    // Note that we subtract EPS_GRID before ceiling. This is to avoid situations where 1.000001 gets ceiled to 2.
-    const float2 roi_bin_grid = ceil(bin_size - EPS_GRID);
-#endif // defined(SAMPLING_RATIO)
-
-    // Move input and output pointer across the fourth dimension
-    input.ptr += roi_batch * input_stride_w;
-    output.ptr += pw * output_stride_w;
-    for(int pz = 0; pz < MAX_DIM_Z; ++pz)
-    {
-#if defined(NHWC)
-        __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, pz, px, py);
-#else  // !defined(NHWC)
-        __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz);
-#endif // defined(NHWC)
-        *_output_ptr = (DATA_TYPE)roi_align_1x1(&input,
-                                                region_start.x,
-                                                bin_size.x,
-                                                roi_bin_grid.x,
-                                                region_end.x,
-                                                region_start.y,
-                                                bin_size.y,
-                                                roi_bin_grid.y,
-                                                region_end.y, pz);
-    }
-}
-#endif // Check for compile time constants
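When SAMPLING_RATIO is not defined, the ROI-align kernel derives its sampling grid from the bin size, shaving EPS_GRID off before ceil() so that a bin size that is numerically 1.000001 still yields one sample per bin rather than two. The same computation in isolation (a sketch; assumes roi_dim has already been scaled by SPATIAL_SCALE):

    #include <math.h>
    #define EPS_GRID 0.00001f
    static float roi_align_grid_size(float roi_dim, int pooled_dim)
    {
        const float bin_size = roi_dim / (float)pooled_dim;
        return ceilf(bin_size - EPS_GRID); /* samples per bin along this axis */
    }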
diff --git a/src/core/CL/cl_kernels/roi_align_layer_quantized.cl b/src/core/CL/cl_kernels/roi_align_layer_quantized.cl
deleted file mode 100644
index d5c9a0d9bf..0000000000
--- a/src/core/CL/cl_kernels/roi_align_layer_quantized.cl
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * Copyright (c) 2019-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers_asymm.h"
-
-// This specifies the value to shift the result of roi_dims / pooled_dims before ceiling.
-// It is close to the epsilon machine (for a floating point system, x and x+EPS are the same number).
-#define EPS_GRID 0.00001f
-
-#if defined(DATA_TYPE) && defined(POOLED_DIM_X) && defined(POOLED_DIM_Y) && defined(MAX_DIM_X) && defined(MAX_DIM_Y) && defined(MAX_DIM_Z) && defined(SPATIAL_SCALE) && defined(OFFSET_IN) && defined(OFFSET_OUT) && defined(SCALE_IN) && defined(SCALE_OUT) && defined(OFFSET_ROIS) && defined(SCALE_ROIS) // Check for compile time constants
-
-/** Performs a roi align on a single output pixel.
- *
- * @param[in] input Pointer to input Tensor3D struct.
- * @param[in] region_start_x Start x index projected onto the input tensor.
- * @param[in] bin_size_x Size of the pooling bin along x.
- * @param[in] grid_size_x Number of sampling points per bin along x.
- * @param[in] region_end_x End x index projected onto the input tensor.
- * @param[in] region_start_y Start y index projected onto the input tensor.
- * @param[in] bin_size_y Size of the pooling bin along y.
- * @param[in] grid_size_y Number of sampling points per bin along y.
- * @param[in] region_end_y End y index projected onto the input tensor.
- * @param[in] pz z index of the input tensor.
- *
- * @return An average pooled value from the region specified in the input tensor.
- */
-inline DATA_TYPE roi_align_1x1(const Tensor3D *input, float region_start_x,
-                               float bin_size_x,
-                               float grid_size_x,
-                               float region_end_x,
-                               float region_start_y,
-                               float bin_size_y,
-                               float grid_size_y,
-                               float region_end_y,
-                               int pz)
-{
-    // Iterate through the pooling region
-    float sum = 0;
-    for(int iy = 0; iy < grid_size_y; ++iy)
-    {
-        for(int ix = 0; ix < grid_size_x; ++ix)
-        {
-            // Align the window in the middle of every bin
-            const float y = region_start_y + (iy + 0.5f) * bin_size_y / (float)grid_size_y;
-            const float x = region_start_x + (ix + 0.5f) * bin_size_x / (float)grid_size_x;
-
-            // Interpolation in the unit square
-            const int y_low  = (int)y;
-            const int x_low  = (int)x;
-            const int y_high = y_low + 1;
-            const int x_high = x_low + 1;
-
-            const float ly = y - y_low;
-            const float lx = x - x_low;
-            const float hy = 1.f - ly;
-            const float hx = 1.f - lx;
-
-            const float w1 = hy * hx;
-            const float w2 = hy * lx;
-            const float w3 = ly * hx;
-            const float w4 = ly * lx;
-#if defined(NHWC)
-            const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_low);
-            const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_low);
-            const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_low, y_high);
-            const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, pz, x_high, y_high);
-#else  // !defined(NHWC)
-            const DATA_TYPE data1 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_low, pz);
-            const DATA_TYPE data2 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_low, pz);
-            const DATA_TYPE data3 = *(__global DATA_TYPE *)tensor3D_offset(input, x_low, y_high, pz);
-            const DATA_TYPE data4 = *(__global DATA_TYPE *)tensor3D_offset(input, x_high, y_high, pz);
-#endif // defined(NHWC)
-
-            const float data1_f32 = DEQUANTIZE(data1, OFFSET_IN, SCALE_IN, DATA_TYPE, 1);
-            const float data2_f32 = DEQUANTIZE(data2, OFFSET_IN, SCALE_IN, DATA_TYPE, 1);
-            const float data3_f32 = DEQUANTIZE(data3, OFFSET_IN, SCALE_IN, DATA_TYPE, 1);
-            const float data4_f32 = DEQUANTIZE(data4, OFFSET_IN, SCALE_IN, DATA_TYPE, 1);
-            sum += w1 * data1_f32 + w2 * data2_f32 + w3 * data3_f32 + w4 * data4_f32;
-        }
-    }
-
-    const float res_f32 = sum / (grid_size_x * grid_size_y);
-    return QUANTIZE(res_f32, OFFSET_OUT, SCALE_OUT, DATA_TYPE, 1);
-}
-
-/** Performs a roi align function.
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=uchar
- * @note Datasize must be passed using -DDATA_SIZE e.g. -DDATA_SIZE=32;
- * @note Input dimensions must be passed using -DMAX_DIM_X, -DMAX_DIM_Y and -DMAX_DIM_Z;
- * @note Pooled region dimensions must be passed using -DPOOLED_DIM_X and -DPOOLED_DIM_Y;
- * @note Spatial scale must be passed using -DSPATIAL_SCALE;
- * @note Sampling ratio (i.e., the number of samples in each bin) may be passed using -DSAMPLING_RATIO. If not defined each roi
- *       will have a default sampling ratio of roi_dims/pooling_dims
- *
- * @param[in] input_ptr Pointer to the source tensor.
Supported data types: QASYMM8
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the pooled region of the source tensor as specified by ROI
- * @param[in] rois_ptr Pointer to the ROIs tensor. Layout: { batch_index, x1, y1, x2, y2 }.
- *                     Supported data types: QASYMM16 with 0.125f scale and 0 offset
- * @param[in] rois_stride_x Stride of the ROIs tensor in X dimension (in bytes)
- * @param[in] rois_step_x Step of the ROIs tensor in X dimension (in bytes)
- * @param[in] rois_stride_y Stride of the ROIs tensor in Y dimension (in bytes)
- * @param[in] rois_step_y Step of the ROIs tensor in Y dimension (in bytes)
- * @param[in] rois_offset_first_element_in_bytes The offset of the first element in the ROIs tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void roi_align_layer_quantized(
-    TENSOR3D_DECLARATION(input),
-    IMAGE_DECLARATION(rois),
-    TENSOR3D_DECLARATION(output),
-    unsigned int input_stride_w, unsigned int output_stride_w)
-{
-    // Get pixels pointer
-    Tensor3D input  = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
-    Image rois      = CONVERT_TO_IMAGE_STRUCT_NO_STEP(rois);
-    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
-
-#if defined(NHWC)
-    const int px = get_global_id(1);
-    const int py = get_global_id(2);
-    const int pw = get_global_id(0);
-#else  // !defined(NHWC)
-    const int px = get_global_id(0);
-    const int py = get_global_id(1);
-    const int pw = get_global_id(2);
-#endif // defined(NHWC)
-
-    // Load roi parameters
-    // roi is laid out as follows { batch_index, x1, y1, x2, y2 }
-    const ushort roi_batch = *((__global ushort *)offset(&rois, 0, pw));
-    float4 roi             = DEQUANTIZE(vload4(0, (__global ushort *)offset(&rois, 1, pw)), OFFSET_ROIS, SCALE_ROIS, ushort, 4);
-    float2 roi_anchor      = roi.s01 * convert_float(SPATIAL_SCALE);
-    float2 roi_dims        = fmax((roi.s23 - roi.s01) * convert_float(SPATIAL_SCALE), 1.f);
-
-    // Calculate pooled region start and end
-    float2 spatial_indx = (float2)(px, py);
-    float2 pooled_dims  = (float2)(POOLED_DIM_X, POOLED_DIM_Y);
(float2)(POOLED_DIM_X, POOLED_DIM_Y); - float2 max_spatial_dims = (float2)(MAX_DIM_X, MAX_DIM_Y); - - float2 bin_size = (float2)((roi_dims.s0 / (float)POOLED_DIM_X), (roi_dims.s1 / (float)POOLED_DIM_Y)); - float2 region_start = spatial_indx * bin_size + roi_anchor; - float2 region_end = (spatial_indx + 1) * bin_size + roi_anchor; - - region_start = clamp(region_start, 0, max_spatial_dims); - region_end = clamp(region_end, 0, max_spatial_dims); - -#if defined(SAMPLING_RATIO) - float2 roi_bin_grid = SAMPLING_RATIO; -#else // !defined(SAMPLING_RATIO) - // Note that we subtract EPS_GRID before ceiling. This is to avoid situations where 1.000001 gets ceiled to 2. - float2 roi_bin_grid = ceil(bin_size - EPS_GRID); -#endif // defined(SAMPLING_RATIO) - - // Move input and output pointer across the fourth dimension - input.ptr += roi_batch * input_stride_w; - output.ptr += pw * output_stride_w; - for(int pz = 0; pz < MAX_DIM_Z; ++pz) - { -#if defined(NHWC) - __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, pz, px, py); -#else // !defined(NHWC) - __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz); -#endif // defined(NHWC) - *_output_ptr = (__global DATA_TYPE)roi_align_1x1(&input, - region_start.x, - bin_size.x, - roi_bin_grid.x, - region_end.x, - region_start.y, - bin_size.y, - roi_bin_grid.y, - region_end.y, pz); - } -} -#endif // Check for compile time constants diff --git a/src/core/CL/cl_kernels/roi_pooling_layer.cl b/src/core/CL/cl_kernels/roi_pooling_layer.cl deleted file mode 100644 index 6899b952e0..0000000000 --- a/src/core/CL/cl_kernels/roi_pooling_layer.cl +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "helpers.h" -#include "helpers_asymm.h" - -#if DATA_SIZE == 32 -#define VEC_SIZE 4 -#define VEC_MAX vec4_max -#elif DATA_SIZE == 16 -#define VEC_SIZE 8 -#define VEC_MAX vec8_max -#elif DATA_SIZE == 8 -#define VEC_SIZE 16 -#define VEC_MAX vec16_max -#else /* DATA_SIZE is not 8, 16 or 32 */ -#error "Unsupported data size" -#endif /* DATA_SIZE == 32 */ - -// Define whether to use max (Quantized datatype) or fmax (Float) functions -#if defined(OFFSET_OUT) && defined(SCALE_OUT) -#define MAX(x, y) max(x, y) -#else // !(defined(OFFSET_OUT) && defined(SCALE_OUT)) -#define MAX(x, y) fmax(x, y) -#endif // defined(OFFSET_OUT) && defined(SCALE_OUT) - -inline DATA_TYPE vec4_max(VEC_DATA_TYPE(DATA_TYPE, 4) vec) -{ - VEC_DATA_TYPE(DATA_TYPE, 2) - temp = MAX(vec.lo, vec.hi); - return MAX(temp.x, temp.y); -} - -inline DATA_TYPE vec8_max(VEC_DATA_TYPE(DATA_TYPE, 8) vec) -{ - VEC_DATA_TYPE(DATA_TYPE, 4) - temp = MAX(vec.lo, vec.hi); - return vec4_max(temp); -} - -inline DATA_TYPE vec16_max(VEC_DATA_TYPE(DATA_TYPE, 16) vec) -{ - VEC_DATA_TYPE(DATA_TYPE, 8) - temp = MAX(vec.lo, vec.hi); - return vec8_max(temp); -} - -/** Performs a roi pooling on a single output pixel. - * - * @param[in] input Pointer to input Tensor3D struct. - * @param[in] region_start_x Start x index projected onto the input tensor. - * @param[in] region_end_x End x index projected onto the input tensor. - * @param[in] region_start_y Start y index projected onto the input tensor. - * @param[in] region_end_y End y index projected onto the input tensor. - * @param[in] pz z index of the input tensor. - * - * @return A max pooled value from the region specified in the input tensor. - */ -inline DATA_TYPE roi_pool_1x1(const Tensor3D *input, int region_start_x, int region_end_x, int region_start_y, int region_end_y, int pz) -{ - // Iterate through the pooling region - if((region_end_x <= region_start_x) || (region_end_y <= region_start_y)) - { - return (DATA_TYPE)0; - } - else - { - int num_iter = (int)((region_end_x - region_start_x) / VEC_SIZE); - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - curr_max = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(MIN_VALUE); - - for(int j = region_start_y; j < region_end_y; ++j) - { - int i = region_start_x; - for(; i < region_start_x + num_iter * VEC_SIZE; i += VEC_SIZE) - { - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(input, i, j, pz)); - curr_max = MAX(val, curr_max); - } - for(; i < region_end_x; ++i) - { - DATA_TYPE val = *(__global DATA_TYPE *)tensor3D_offset(input, i, j, pz); - curr_max = MAX(curr_max, val); - } - } - - const DATA_TYPE temp = (DATA_TYPE)VEC_MAX(curr_max); - -#if defined(OFFSET_OUT) && defined(SCALE_OUT) - return QUANTIZE(temp, OFFSET_OUT, SCALE_OUT, DATA_TYPE, 1); -#endif /* if quantized, requantize and return */ - - return temp; - } -} - -/** Performs a roi pooling function. - * - * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16, F32, QASYMM8; - * @note Datasize must be passed using -DDATA_SIZE e.g. -DDATA_SIZE=32; - * @note Input dimensions must be passed using -DMAX_DIM_X, -DMAX_DIM_Y and -DMAX_DIM_Z; - * @note Pooled region dimensions must be passed using -DPOOLED_DIM_X and -DPOOLED_DIM_Y; - * @note Spatial scale must be passed using -DSPATIAL_SCALE; - * - * @param[in] input_ptr Pointer to the source image.
Supported data types: F16, F32, QASYMM8 - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the pooled region of the source image as specified by the ROI - * @param[in] rois_ptr Pointer to the ROIs tensor. Layout: { batch_index, x1, y1, x2, y2 }. Supported data types: same as @p input_ptr - * @param[in] rois_stride_x Stride of the ROIs tensor in X dimension (in bytes) - * @param[in] rois_step_x Step of the ROIs tensor in X dimension (in bytes) - * @param[in] rois_stride_y Stride of the ROIs tensor in Y dimension (in bytes) - * @param[in] rois_step_y Step of the ROIs tensor in Y dimension (in bytes) - * @param[in] rois_offset_first_element_in_bytes The offset of the first element in the ROIs tensor - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as input - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] input_stride_w Stride of the source image in W dimension (in bytes) - * @param[in] output_stride_w Stride of the destination image in W dimension (in bytes) - */ -__kernel void roi_pooling_layer( - TENSOR3D_DECLARATION(input), - IMAGE_DECLARATION(rois), - TENSOR3D_DECLARATION(output), - unsigned int input_stride_w, unsigned int output_stride_w) -{ - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input); - Image rois = CONVERT_TO_IMAGE_STRUCT_NO_STEP(rois); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output); - - const int px = get_global_id(0); - const int py = get_global_id(1); - const int pw = get_global_id(2); - - // Load roi parameters - // roi is laid out as follows { batch_index, x1, y1, x2, y2 } - const ushort roi_batch = (ushort) * ((__global ushort *)offset(&rois, 0, pw)); - const VEC_DATA_TYPE(ushort, 4) - roi = vload4(0, (__global ushort *)offset(&rois, 1, pw)); - const int2 roi_anchor = convert_int2_sat(round(convert_float2(roi.s01) * (float)SPATIAL_SCALE)); - const int2 roi_dims = convert_int2_sat(fmax(round(convert_float2(roi.s23 - roi.s01) * (float)SPATIAL_SCALE), 1.f)); - - // Calculate pooled region start and end - const float2 spatial_indx = (float2)(px, py); - const float2 pooled_dims = (float2)(POOLED_DIM_X, POOLED_DIM_Y); - const int2 max_spatial_dims = (int2)(MAX_DIM_X, MAX_DIM_Y); - int2 region_start = convert_int2_sat(floor(spatial_indx / pooled_dims *
convert_float2(roi_dims))) + roi_anchor; - int2 region_end = convert_int2_sat(floor((spatial_indx + 1) / pooled_dims * convert_float2(roi_dims))) + roi_anchor; - - region_start = clamp(region_start, 0, max_spatial_dims); - region_end = clamp(region_end, 0, max_spatial_dims); - - // Move input and output pointer across the fourth dimension - input.ptr += roi_batch * input_stride_w; - output.ptr += pw * output_stride_w; - - for(int pz = 0; pz < MAX_DIM_Z; ++pz) - { - *(__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz) = (__global DATA_TYPE)roi_pool_1x1(&input, - region_start.x, - region_end.x, - region_start.y, - region_end.y, pz); - } -} diff --git a/src/core/CL/cl_kernels/scale.cl b/src/core/CL/cl_kernels/scale.cl deleted file mode 100644 index d4c27e6cf6..0000000000 --- a/src/core/CL/cl_kernels/scale.cl +++ /dev/null @@ -1,297 +0,0 @@ -/* - * Copyright (c) 2016-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" -#include "warp_helpers.h" - -/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates. - * - * @param[in] coord 2D coordinates to transform. - * @param[in] scale input/output scale ratio - * - * @return a float8 containing 4 2D transformed values in the input image. - */ -inline const float8 transform_nearest(const float2 coord, const float2 scale) -{ -#ifdef SAMPLING_POLICY_TOP_LEFT - const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0); - const float4 new_x = in_x_coords * (float4)(scale.s0); - const float4 new_y = (float4)(coord.s1 * scale.s1); - return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); -#elif SAMPLING_POLICY_CENTER - const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0); - const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0); - const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1); - return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); -#else /* SAMPLING_POLICY */ -#error("Unsupported sampling policy"); -#endif /* SAMPLING_POLICY */ -} - -/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates. - * - * @param[in] coord 2D coordinates to transform. - * @param[in] scale input/output scale ratio - * - * @return a float8 containing 4 2D transformed values in the input image. 
- */ -inline const float8 transform_bilinear(const float2 coord, const float2 scale) -{ - const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0); -#ifdef SAMPLING_POLICY_TOP_LEFT - const float4 new_x = in_x_coords * (float4)(scale.s0); - const float4 new_y = (float4)(coord.s1 * scale.s1); - return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); -#elif SAMPLING_POLICY_CENTER - const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f); - const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f); - return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); -#else /* SAMPLING_POLICY */ -#error("Unsupported sampling policy"); -#endif /* SAMPLING_POLICY */ -} - -/** Performs an affine transformation on an image interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel U8 or S16. - * - * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT - * - * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16. - * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input) - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] input_width Input image width - * @param[in] input_height Input image height - * @param[in] scale_x The scale factor along x dimension - * @param[in] scale_y The scale factor along y dimension - */ -__kernel void scale_nearest_neighbour_nchw( - IMAGE_DECLARATION(in), - IMAGE_DECLARATION(out), - const float input_width, - const float input_height, - const float scale_x, - const float scale_y) -{ - Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); - Image out = CONVERT_TO_IMAGE_STRUCT(out); - const float2 r = (float2)(scale_x, scale_y); - float8 transformed = transform_nearest(get_current_coords(), r); -#ifdef ALIGN_CORNERS - transformed = round(transformed); -#endif // ALIGN_CORNERS - const float8 tc = clamp_to_border_with_size(transformed, input_width, input_height, BORDER_SIZE); - vstore4(read_texels4(&in, convert_int8(tc)), 0, (__global DATA_TYPE *)out.ptr); -} - -/** Performs an affine transformation on an image interpolating with the BILINEAR method. - * - * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT - * - * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16.
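The transform helpers above differ only in the sampling policy and, for bilinear, an extra half-pixel shift; a standalone C sketch of the per-coordinate mapping (the two flags are illustrative stand-ins for the -DSAMPLING_POLICY_* compile-time switch):

/* Output->input coordinate mapping used by the scale kernels.
 * center != 0 mimics SAMPLING_POLICY_CENTER, otherwise TOP_LEFT;
 * the trailing -0.5f applies to the bilinear variants only. */
static float map_coord(int out_coord, float scale, int center, int bilinear)
{
    if(center)
    {
        return (out_coord + 0.5f) * scale - (bilinear ? 0.5f : 0.0f);
    }
    return out_coord * scale;
}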
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input) - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] input_width Input image width - * @param[in] input_height Input image height - * @param[in] scale_x The scale factor along x dimension - * @param[in] scale_y The scale factor along y dimension - */ -__kernel void scale_bilinear_nchw( - IMAGE_DECLARATION(in), - IMAGE_DECLARATION(out), - const float input_width, - const float input_height, - const float scale_x, - const float scale_y) -{ - Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); - Image out = CONVERT_TO_IMAGE_STRUCT(out); - const float2 r = (float2)(scale_x, scale_y); - const float8 tc = transform_bilinear(get_current_coords(), r); - vstore4(bilinear_interpolate_with_border(&in, tc, input_width, input_height, BORDER_SIZE), 0, (__global DATA_TYPE *)out.ptr); -} - -#if defined(DEPTH_OUT) -/** Performs scale on an image interpolating with the NEAREST NEIGHBOUR method. Input and output are single channel F32. (NHWC) - * - * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT - * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 - * - * @param[in] in_ptr Pointer to the source image. Supported data types: U8/S16/F16/F32. - * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in_stride_z Stride of the source image in Z dimension (in bytes) - * @param[in] in_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image.
Supported data types: same as @p in_ptr - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes) - * @param[in] out_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] input_width Input image width - * @param[in] input_height Input image height - * @param[in] scale_x The scale factor along x dimension - * @param[in] scale_y The scale factor along y dimension - */ -__kernel void scale_nearest_neighbour_nhwc( - TENSOR4D_DECLARATION(in), - TENSOR4D_DECLARATION(out), - const float input_width, - const float input_height, - const float scale_x, - const float scale_y) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT); - -#ifdef SAMPLING_POLICY_TOP_LEFT - float new_x = get_global_id(1) * scale_x; - float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y; -#elif SAMPLING_POLICY_CENTER - float new_x = (get_global_id(1) + 0.5f) * scale_x; - float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y; -#else /* SAMPLING_POLICY */ -#error("Unsupported sampling policy"); -#endif /* SAMPLING_POLICY */ -#ifdef ALIGN_CORNERS - new_x = round(new_x); - new_y = round(new_y); -#endif /* ALIGN_CORNERS */ - const float clamped_x = clamp(new_x, 0.0f, input_width - 1); - const float clamped_y = clamp(new_y, 0.0f, input_height - 1); - - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))); -} - -/** Performs scale on an image interpolating with the BILINEAR method. (NHWC) - * - * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT - * @note If border mode replicate is used, it should be passed as -DBORDER_MODE_REPLICATE - * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 - * @note The value to be used at the edges of the images should be given as a preprocessor argument using -DCONSTANT_VALUE=value. - * - * @param[in] in_ptr Pointer to the source image. Supported data types: U8/S16/F16/F32. - * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in_stride_z Stride of the source image in Z dimension (in bytes) - * @param[in] in_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image.
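As a cross-check of the nearest-neighbour path above, a scalar C equivalent (single channel, row-major; the scale computation here is illustrative, the kernels receive scale_x/scale_y precomputed):

#include <math.h>

static void scale_nearest(const float *in, int in_w, int in_h,
                          float *out, int out_w, int out_h, int align_corners)
{
    const float sx = (float)in_w / out_w; /* illustrative; normally precomputed */
    const float sy = (float)in_h / out_h;
    for(int y = 0; y < out_h; ++y)
    {
        for(int x = 0; x < out_w; ++x)
        {
            float fx = (x + 0.5f) * sx; /* SAMPLING_POLICY_CENTER variant */
            float fy = (y + 0.5f) * sy;
            if(align_corners)
            {
                fx = roundf(fx);
                fy = roundf(fy);
            }
            const int ix = (int)fminf(fmaxf(fx, 0.0f), (float)(in_w - 1));
            const int iy = (int)fminf(fmaxf(fy, 0.0f), (float)(in_h - 1));
            out[y * out_w + x] = in[iy * in_w + ix];
        }
    }
}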
Supported data types: same as @p in_ptr - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes) - * @param[in] out_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] input_width Input image width - * @param[in] input_height Input image height - * @param[in] scale_x The scale factor along x dimension - * @param[in] scale_y The scale factor along y dimension - * - */ -__kernel void scale_bilinear_nhwc( - TENSOR4D_DECLARATION(in), - TENSOR4D_DECLARATION(out), - const float input_width, - const float input_height, - const float scale_x, - const float scale_y) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT); - -#ifdef SAMPLING_POLICY_TOP_LEFT - const float new_x = get_global_id(1) * scale_x; - const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y; -#elif SAMPLING_POLICY_CENTER - const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f; - const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y - 0.5f; -#else /* SAMPLING_POLICY */ -#error("Unsupported sampling policy"); -#endif /* SAMPLING_POLICY */ - - const float new_xf = floor(new_x); - const float new_yf = floor(new_y); - const float clamped_x = clamp(new_xf, 0.0f, input_width - 1); - const float clamped_x1 = clamp(new_xf + 1, 0.0f, input_width - 1); - const float clamped_y = clamp(new_yf, 0.0f, input_height - 1); - const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1); - -#ifndef BORDER_MODE_REPLICATE - const bool check_x = (0.f <= new_xf && new_xf < input_width); - const bool check_x1 = (-1.f <= new_xf && new_xf < input_width - 1); - const bool check_y = (0.f <= new_yf && new_yf < input_height); - const bool check_y1 = (-1.f <= new_yf && new_yf < input_height - 1); - const float ins_0 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), - (get_global_id(2) / DEPTH_OUT)))), - check_x && check_y); - const float ins_1 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), - (get_global_id(2) / DEPTH_OUT)))), - check_x1 && check_y); - const float ins_2 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), - (get_global_id(2) / DEPTH_OUT)))), - check_x && check_y1); - const float ins_3 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), - (get_global_id(2) / DEPTH_OUT)))), - check_x1 && check_y1); - float4 ins = (float4)(ins_0, ins_1, ins_2, ins_3); -#else /* BORDER_MODE_REPLICATE */ - float4 ins = (float4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), - *((__global DATA_TYPE
*)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), - *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))), - *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT)))); -#endif /* BORDER_MODE_REPLICATE */ - - const float a = new_x - new_xf; - const float b = 1.f - a; - const float a1 = new_y - new_yf; - const float b1 = 1.f - a1; - const float fr = ((ins.s0 * b * b1) + (ins.s1 * a * b1) + (ins.s2 * b * a1) + (ins.s3 * a * a1)); - - *((__global DATA_TYPE *)out.ptr) = CONVERT(fr, DATA_TYPE); -} -#endif /* defined(DEPTH_OUT) */ \ No newline at end of file diff --git a/src/core/CL/cl_kernels/scale_quantized.cl b/src/core/CL/cl_kernels/scale_quantized.cl deleted file mode 100644 index 010e4ed57a..0000000000 --- a/src/core/CL/cl_kernels/scale_quantized.cl +++ /dev/null @@ -1,185 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers_asymm.h" -#include "warp_helpers_quantized.h" - -/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates. - * - * @param[in] coord 2D coordinates to transform. - * @param[in] scale input/output scale ratio - * - * @return a float8 containing 4 2D transformed values in the input image. - */ -inline const float8 transform_bilinear_quantized(const float2 coord, const float2 scale) -{ - const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0); -#ifdef SAMPLING_POLICY_TOP_LEFT - const float4 new_x = in_x_coords * (float4)(scale.s0); - const float4 new_y = (float4)(coord.s1 * scale.s1); - return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); -#elif SAMPLING_POLICY_CENTER - const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f); - const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f); - return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); -#else /* SAMPLING_POLICY */ -#error("Unsupported sampling policy"); -#endif /* SAMPLING_POLICY */ -} - -/** Performs an affine transformation on an image interpolating with the BILINEAR method. 
- * - * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT - * @note Scale value for QASYMM8 data type to be used is passed as -DSCALE= e.g. -DSCALE=0.5 - * @note Offset value for QASYMM8 data type to be used is passed as -DOFFSET= e.g. -DOFFSET=1 - * - * @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8. - * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in_ptr - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] input_width Input image width - * @param[in] input_height Input image height - * @param[in] scale_x The scale factor along x dimension - * @param[in] scale_y The scale factor along y dimension - */ -__kernel void scale_bilinear_quantized_nchw( - IMAGE_DECLARATION(in), - IMAGE_DECLARATION(out), - const float input_width, - const float input_height, - const float scale_x, - const float scale_y) -{ - Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); - Image out = CONVERT_TO_IMAGE_STRUCT(out); - const float2 r = (float2)(scale_x, scale_y); - const float8 tc = transform_bilinear_quantized(get_current_coords_quantized(), r); - vstore4(bilinear_interpolate_with_border_quantized(&in, tc, input_width, input_height, BORDER_SIZE, SCALE, OFFSET), 0, (__global DATA_TYPE *)out.ptr); -} - -#if defined(DEPTH_OUT) -/** Performs scale on an image interpolating with the BILINEAR method. (NHWC) - * - * @note Sampling policy to be used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT - * @note Scale value for QASYMM8 data type to be used is passed as -DSCALE= e.g. -DSCALE=0.5 - * @note Offset value for QASYMM8 data type to be used is passed as -DOFFSET= e.g. -DOFFSET=1 - * @note If border mode replicate is used, it should be passed as -DBORDER_MODE_REPLICATE - * @note Output tensor's depth should be given as a preprocessor argument using -DDEPTH_OUT=size. e.g. -DDEPTH_OUT=16 - * @note The value to be used at the edges of the images should be given as a preprocessor argument using -DCONSTANT_VALUE=value. - * - * @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8.
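The quantized kernels above and below dequantize the four taps, blend in float and requantize the result; a compact C sketch of that round trip (scale/offset mirror -DSCALE/-DOFFSET; ceilf approximates convert_int_sat_rtp's round toward positive infinity):

#include <math.h>
#include <stdint.h>

/* taps = { top-left, top-right, bottom-left, bottom-right };
 * a/a1 are the fractional x/y distances from the top-left tap. */
static uint8_t blend_qasymm8(const uint8_t taps[4], float a, float a1,
                             float scale, int offset)
{
    float f[4];
    for(int i = 0; i < 4; ++i)
    {
        f[i] = ((int)taps[i] - offset) * scale; /* dequantize */
    }
    const float b = 1.f - a, b1 = 1.f - a1;
    const float fr = f[0] * b * b1 + f[1] * a * b1 + f[2] * b * a1 + f[3] * a * a1;
    int q = (int)ceilf(fr / scale) + offset; /* requantize */
    q = q < 0 ? 0 : (q > 255 ? 255 : q);     /* saturate to uint8 */
    return (uint8_t)q;
}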
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in_stride_z Stride of the source image in Z dimension (in bytes) - * @param[in] in_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in_ptr - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the destination image in Z dimension (in bytes) - * @param[in] out_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] input_width Input image width - * @param[in] input_height Input image height - * @param[in] scale_x The scale factor along x dimension - * @param[in] scale_y The scale factor along y dimension - */ -__kernel void scale_bilinear_quantized_nhwc( - TENSOR4D_DECLARATION(in), - TENSOR4D_DECLARATION(out), - const float input_width, - const float input_height, - const float scale_x, - const float scale_y) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT); - -#ifdef SAMPLING_POLICY_TOP_LEFT - const float new_x = get_global_id(1) * scale_x; - const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y; -#elif SAMPLING_POLICY_CENTER - const float new_x = (get_global_id(1) + 0.5f) * scale_x - 0.5f; - const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y - 0.5f; -#else /* SAMPLING_POLICY */ -#error("Unsupported sampling policy"); -#endif /* SAMPLING_POLICY */ - - const float new_xf = floor(new_x); - const float new_yf = floor(new_y); - const float clamped_x = clamp(new_xf, 0.0f, input_width - 1); - const float clamped_x1 = clamp(new_xf + 1, 0.0f, input_width - 1); - const float clamped_y = clamp(new_yf, 0.0f, input_height - 1); - const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1); - -#ifndef BORDER_MODE_REPLICATE - const bool check_x = (0.f <= new_xf && new_xf < input_width); - const bool check_x1 = (-1.f <= new_xf && new_xf < input_width - 1); - const bool check_y = (0.f <= new_yf && new_yf < input_height); - const bool check_y1 = (-1.f <= new_yf && new_yf < input_height - 1); - const int ins_0 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), - (get_global_id(2) / DEPTH_OUT)))), - check_x && check_y); - const int ins_1 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), - (get_global_id(2) / DEPTH_OUT)))), - check_x1 && check_y); - const int
ins_2 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), - (get_global_id(2) / DEPTH_OUT)))), - check_x && check_y1); - const int ins_3 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), - (get_global_id(2) / DEPTH_OUT)))), - check_x1 && check_y1); - int4 ins = (int4)(ins_0, ins_1, ins_2, ins_3); -#else /* BORDER_MODE_REPLICATE */ - int4 ins = (int4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), - *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), - *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT))), - *((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT)))); -#endif /* BORDER_MODE_REPLICATE */ - - const float a = new_x - new_xf; - const float b = 1.f - a; - const float a1 = new_y - new_yf; - const float b1 = 1.f - a1; - const float4 insf32 = convert_float4(ins - (int4)OFFSET) * (float4)SCALE; - - const float fr = ((insf32.s0 * b * b1) + (insf32.s1 * a * b1) + (insf32.s2 * b * a1) + (insf32.s3 * a * a1)); - - DATA_TYPE res = CONVERT_SAT(convert_int_sat_rtp(fr / SCALE) + OFFSET, DATA_TYPE); - - *((__global DATA_TYPE *)out.ptr) = res; -} -#endif /* defined(DEPTH_OUT) */ \ No newline at end of file diff --git a/src/core/CL/cl_kernels/select.cl b/src/core/CL/cl_kernels/select.cl deleted file mode 100644 index 6fd4bd4ce3..0000000000 --- a/src/core/CL/cl_kernels/select.cl +++ /dev/null @@ -1,228 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) -/** This function performs a select operation between two tensors when the condition tensor has the same rank. - * - * @attention The data_type needs to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
-DVEC_SIZE=16 - * @attention Leftover size in the X dimension should be given as preprocessor argument using -DVEC_SIZE_LEFTOVER=value: e.g. x_dimension % VEC_SIZE - * - * @param[in] c_ptr Pointer to the source tensor. Supported data types: U8 - * @param[in] c_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] c_step_x c_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] c_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] c_step_y c_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] c_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] c_step_z c_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] c_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] x_ptr Pointer to the source tensor. Supported data types: All - * @param[in] x_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] x_step_x x_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] x_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] x_step_y x_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] x_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] x_step_z x_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] x_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] y_ptr Pointer to the source tensor. Supported data types: same as @p x_ptr - * @param[in] y_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] y_step_x y_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] y_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] y_step_y y_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] y_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] y_step_z y_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] y_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] out_ptr Pointer to the destination tensor. 
Supported data types: same as @p x_ptr - * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void select_same_rank( - TENSOR3D_DECLARATION(c), - TENSOR3D_DECLARATION(x), - TENSOR3D_DECLARATION(y), - TENSOR3D_DECLARATION(out)) -{ - // Get pointers - uint offset = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); - __global uchar *c_addr = c_ptr + c_offset_first_element_in_bytes + offset + get_global_id(1) * c_step_y + get_global_id(2) * c_step_z; - __global uchar *x_addr = x_ptr + x_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * x_step_y + get_global_id(2) * x_step_z; - __global uchar *y_addr = y_ptr + y_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * y_step_y + get_global_id(2) * y_step_z; - __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z; - - // Load values - SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - in_c = CONVERT(VLOAD(VEC_SIZE)(0, c_addr), SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_addr); - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - in_y = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)y_addr); - - // Calculate result - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - res0 = select(in_y, in_x, CONVERT(in_c > (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))); - - // Boundary-aware store - STORE_VECTOR_SELECT(res, DATA_TYPE, (__global DATA_TYPE *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); -} - -/** This function performs a select operation between two tensors when the condition tensor has a different rank. - * - * @attention The data_type needs to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @attention Leftover size in the X dimension should be given as preprocessor argument using -DVEC_SIZE_LEFTOVER=value: e.g. x_dimension % VEC_SIZE - * - * @param[in] c_ptr Pointer to the source tensor. Supported data types: U8 - * @param[in] c_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] c_step_x c_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] c_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] x_ptr Pointer to the source tensor.
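The offset computation above is the leftover trick shared by these select kernels: when the X extent is not a multiple of VEC_SIZE, every work-item's vector is shifted back so the last one ends exactly at the tensor edge, and work-item 0 (clamped to 0) stores only the leftover elements via STORE_VECTOR_SELECT. A small C sketch of the index math (vec_size/leftover mirror -DVEC_SIZE/-DVEC_SIZE_LEFTOVER):

/* Start element for work-item gid; matches
 * max(gid * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE, 0). */
static int vector_offset(int gid, int vec_size, int leftover)
{
    const int shift = (vec_size - leftover) % vec_size; /* 0 when no leftover */
    const int off   = gid * vec_size - shift;
    return off > 0 ? off : 0;
}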
Supported data types: All - * @param[in] x_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] x_step_x x_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] x_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] x_step_y x_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] x_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] x_step_z x_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] x_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] y_ptr Pointer to the source tensor. Supported data types: same as @p x_ptr - * @param[in] y_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] y_step_x y_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] y_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] y_step_y y_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] y_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] y_step_z y_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] y_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] out_ptr Pointer to the destination tensor. Supported data types: same as @p x_ptr - * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void select_different_rank_2( - VECTOR_DECLARATION(c), - TENSOR3D_DECLARATION(x), - TENSOR3D_DECLARATION(y), - TENSOR3D_DECLARATION(out)) -{ - const int c_idx = get_global_id(1); - - // Get pointers - uint offset = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); - __global uchar *c_addr = c_ptr + c_offset_first_element_in_bytes; - __global uchar *x_addr = x_ptr + x_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * x_step_y + get_global_id(2) * x_step_z; - __global uchar *y_addr = y_ptr + y_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * y_step_y + get_global_id(2) * y_step_z; - __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z; - - // Load values - SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - in_c = *((__global uchar *)(c_addr + c_idx * c_stride_x)); - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_addr); - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - in_y = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)y_addr); - - // Calculate result - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - res0 = select(in_y, in_x, CONVERT(in_c > (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))); - - // Boundary-aware store - 
STORE_VECTOR_SELECT(res, DATA_TYPE, (__global DATA_TYPE *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); -} -#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) */ - -#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_SIZE) && defined(VEC_SIZE_LEFTOVER) -/** This function performs a select operation between two tensors when the condition tensor has a different rank. - * - * @attention The data_type needs to be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=uchar - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @attention Leftover size in the X dimension should be given as preprocessor argument using -DVEC_SIZE_LEFTOVER=value: e.g. x_dimension % VEC_SIZE - * - * @param[in] c_ptr Pointer to the source tensor. Supported data types: U8 - * @param[in] c_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] c_step_x c_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] c_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] x_ptr Pointer to the source tensor. Supported data types: All - * @param[in] x_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] x_step_x x_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] x_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] x_step_y x_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] x_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] x_step_z x_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] x_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] y_ptr Pointer to the source tensor. Supported data types: same as @p x_ptr - * @param[in] y_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] y_step_x y_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] y_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] y_step_y y_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] y_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] y_step_z y_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] y_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] out_ptr Pointer to the destination tensor.
Supported data types: same as @p x_ptr - * @param[in] out_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void select_different_rank_n( - VECTOR_DECLARATION(c), - TENSOR3D_DECLARATION(x), - TENSOR3D_DECLARATION(y), - TENSOR3D_DECLARATION(out)) -{ - const int c_idx = get_global_id(2) / DEPTH_SIZE; - - // Get pointers - uint offset = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); - __global uchar *c_addr = c_ptr + c_offset_first_element_in_bytes; - __global uchar *x_addr = x_ptr + x_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * x_step_y + get_global_id(2) * x_step_z; - __global uchar *y_addr = y_ptr + y_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * y_step_y + get_global_id(2) * y_step_z; - __global uchar *out_addr = out_ptr + out_offset_first_element_in_bytes + offset * sizeof(DATA_TYPE) + get_global_id(1) * out_step_y + get_global_id(2) * out_step_z; - - // Load values - SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - in_c = *((__global uchar *)(c_addr + c_idx * c_stride_x)); - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - in_x = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)x_addr); - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - in_y = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)y_addr); - - // Calculate result - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - res0 = select(in_y, in_x, CONVERT(in_c > (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0, SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))); - - // Boundary-aware store - STORE_VECTOR_SELECT(res, DATA_TYPE, (__global DATA_TYPE *)out_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); -} -#endif /* defined(DATA_TYPE) && defined(VEC_SIZE) && defined(DEPTH_SIZE) && defined(VEC_SIZE_LEFTOVER) */ \ No newline at end of file diff --git a/src/core/CL/cl_kernels/slice_ops.cl b/src/core/CL/cl_kernels/slice_ops.cl deleted file mode 100644 index dc3ffd91c1..0000000000 --- a/src/core/CL/cl_kernels/slice_ops.cl +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** Performs a strided slice operation on a given input. - * - * @attention Supported tensor rank: up to 4 - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float - * @attention Input and output tensor depths should be given as preprocessor arguments using -DSRC_DEPTH=size and -DDST_DEPTH=size - * @attention Absolute start coordinates for each dimension should be given as a preprocessor argument -DSTART_index=value e.g. -DSTART_0=2 - * @attention Strides for each dimension should be given as a preprocessor argument -DSTRIDE_index=value e.g. -DSTRIDE_1=1 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: All - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor.
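Per dimension, the kernel resolves each output index back to an input element at start + index * stride; a plain C sketch of that per-axis mapping (names are illustrative):

/* 1-D strided slice: out[i] = in[start + i * stride], the mapping applied
 * by strided_slice for every non-shrunk dimension. */
static void strided_slice_1d(const float *in, float *out, int out_len,
                             int start, int stride)
{
    for(int i = 0; i < out_len; ++i)
    {
        out[i] = in[start + i * stride];
    }
}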
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void strided_slice( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output)) -{ - // Get pixels pointer - Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, SRC_DEPTH); - Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH); - - int offset = 0; - - // Offset X -#if defined(SHRINK_0) - input.ptr += (int)START_0 * input_stride_x; -#elif defined(START_0) && defined(STRIDE_0) && defined(VEC_SIZE) && defined(LAST_ACCESSED_X) - // Check if access on width gets out of bounds - // If it does shift access vector to access elements within bounds - const int xi = (int)(get_global_id(0) * VEC_SIZE); - offset = (int)START_0 + min(xi, (int)LAST_ACCESSED_X); - input.ptr += offset * input_stride_x; - output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x; -#elif defined(START_0) && defined(STRIDE_0) - offset = (int)START_0 + (int)get_global_id(0) * (int)STRIDE_0; - input.ptr += offset * input_stride_x; -#endif // defined(START_0) && defined(STRIDE_0) - - // Offset Y -#if defined(SHRINK_1) - input.ptr += (int)START_1 * input_stride_y; -#elif defined(START_1) && defined(STRIDE_1) -#if defined(SHRINK_0) - offset = (int)START_1 + (int)get_global_id(0) * (int)STRIDE_1; -#else // defined(SHRINK_0) - offset = (int)START_1 + (int)get_global_id(1) * (int)STRIDE_1; -#endif // defined(SHRINK_0) - input.ptr += offset * input_stride_y; -#endif // defined(START_1) && defined(STRIDE_1) - - // Offset Z -#if defined(SHRINK_2) - input.ptr += (int)START_2 * input_stride_z; -#elif defined(START_2) && defined(STRIDE_2) - -#if defined(SHRINK_1) && defined(SHRINK_0) - offset = (int)START_2 + (int)get_global_id(0) * (int)STRIDE_2; -#elif defined(SHRINK_1) || defined(SHRINK_0) - offset = (int)START_2 + (int)get_global_id(1) * (int)STRIDE_2; -#else // defined(SHRINK_1) && defined(SHRINK_0) - offset = (int)START_2 + ((int)get_global_id(2) % (int)DST_DEPTH) * (int)STRIDE_2; -#endif // defined(SHRINK_1) && defined(SHRINK_0) - - input.ptr += offset * input_stride_z; -#endif // defined(START_2) && defined(STRIDE_2) - - // Offset depth -#if defined(SHRINK_3) - input.ptr += (int)START_3 * input_stride_w; -#elif defined(START_3) && defined(STRIDE_3) -#if defined(SHRINK_2) && defined(SHRINK_1) && defined(SHRINK_0) - offset = (int)START_3 + (int)get_global_id(0) * (int)STRIDE_3; -#elif !defined(SHRINK_2) && !defined(SHRINK_1) && !defined(SHRINK_0) - offset = (int)START_3 + ((int)get_global_id(2) / (int)DST_DEPTH) * (int)STRIDE_3; -#elif(defined(SHRINK_0) && defined(SHRINK_1)) || (defined(SHRINK_1) && defined(SHRINK_2)) || (defined(SHRINK_0) && defined(SHRINK_2)) - 
offset = (int)START_3 + (int)get_global_id(1) * (int)STRIDE_3; -#else // defined(SHRINK_2) && defined(SHRINK_1) && defined(SHRINK_0) - offset = (int)START_3 + ((int)get_global_id(2) % (int)DST_DEPTH) * (int)STRIDE_3; -#endif // defined(SHRINK_2) && defined(SHRINK_1) && defined(SHRINK_0) - input.ptr += offset * input_stride_w; -#endif // defined(START_3) && defined(STRIDE_3) - - // Store result -#if defined(VEC_SIZE) && defined(LAST_ACCESSED_X) - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input.ptr)); - - VSTORE(VEC_SIZE) - (val, 0, (__global DATA_TYPE *)(output.ptr)); -#else // defined(VEC_SIZE) && defined(LAST_ACCESSED_X) - *((__global DATA_TYPE *)(output.ptr)) = *((__global DATA_TYPE *)(input.ptr)); -#endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X) -} diff --git a/src/core/CL/cl_kernels/sobel_filter.cl b/src/core/CL/cl_kernels/sobel_filter.cl deleted file mode 100644 index 7983734fc4..0000000000 --- a/src/core/CL/cl_kernels/sobel_filter.cl +++ /dev/null @@ -1,541 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/***********************************************/ -/* Begin implementation of Sobel3x3 filter */ -/***********************************************/ - -/** This OpenCL kernel that computes a Sobel3x3 filter. - * - * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient - * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_gx_ptr Pointer to the destination image. 
Supported data types: S16 - * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16 - * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void sobel3x3( - IMAGE_DECLARATION(src) -#ifdef GRAD_X - , - IMAGE_DECLARATION(dst_gx) -#endif /* GRAD_X */ -#ifdef GRAD_Y - , - IMAGE_DECLARATION(dst_gy) -#endif /* GRAD_Y */ -) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); -#ifdef GRAD_X - Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx); -#endif /* GRAD_X */ -#ifdef GRAD_Y - Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy); -#endif /* GRAD_Y */ - - // Output pixels -#ifdef GRAD_X - short8 gx = (short8)0; -#endif /* GRAD_X */ -#ifdef GRAD_Y - short8 gy = (short8)0; -#endif /* GRAD_Y */ - - // Row0 - uchar16 temp = vload16(0, offset(&src, -1, -1)); - short8 left = convert_short8(temp.s01234567); - short8 middle = convert_short8(temp.s12345678); - short8 right = convert_short8(temp.s23456789); -#ifdef GRAD_X - gx += left * (short8)(-1); - gx += right * (short8)(+1); -#endif /* GRAD_X */ -#ifdef GRAD_Y - gy += left * (short8)(-1); - gy += middle * (short8)(-2); - gy += right * (short8)(-1); -#endif /* GRAD_Y */ - - // Row1 - temp = vload16(0, offset(&src, -1, 0)); - left = convert_short8(temp.s01234567); - right = convert_short8(temp.s23456789); -#ifdef GRAD_X - gx += left * (short8)(-2); - gx += right * (short8)(+2); -#endif /* GRAD_X */ - - // Row2 - temp = vload16(0, offset(&src, -1, 1)); - left = convert_short8(temp.s01234567); - middle = convert_short8(temp.s12345678); - right = convert_short8(temp.s23456789); -#ifdef GRAD_X - gx += left * (short8)(-1); - gx += right * (short8)(+1); -#endif /* GRAD_X */ -#ifdef GRAD_Y - gy += left * (short8)(+1); - gy += middle * (short8)(+2); - gy += right * (short8)(+1); -#endif /* GRAD_Y */ - - // Store results -#ifdef GRAD_X - vstore8(gx, 0, ((__global short *)dst_gx.ptr)); -#endif /* GRAD_X */ -#ifdef GRAD_Y - vstore8(gy, 0, ((__global short *)dst_gy.ptr)); -#endif /* GRAD_Y */ -} - -/**********************************************/ -/* End implementation of Sobel3x3 filter */ -/**********************************************/ - -/***********************************************/ -/* Begin implementation of Sobel5x5 filter */ -/***********************************************/ - -/** Compute a 1D horizontal sobel filter 1x5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels). - * - * @param[in] src Pointer to source image. 
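As a cross-check for the vectorized row passes of the sobel3x3 kernel above, the same gradients can be computed per pixel on the host. A minimal sketch, assuming a dense w-by-h single-channel U8 image and an interior pixel; sobel3x3_ref is an illustrative name, not library code.

#include <stdint.h>

/* Scalar cross-check for one interior pixel of the vectorized sobel3x3
 * kernel above; weights match the kernel's three row passes. */
static void sobel3x3_ref(const uint8_t *src, int w, int x, int y,
                         int16_t *gx, int16_t *gy)
{
#define P(dx, dy) ((int16_t)src[(y + (dy)) * w + (x + (dx))])
    *gx = (int16_t)(-P(-1, -1) + P(1, -1)
                    - 2 * P(-1, 0) + 2 * P(1, 0)
                    - P(-1, 1) + P(1, 1));               /* horizontal derivative */
    *gy = (int16_t)(-P(-1, -1) - 2 * P(0, -1) - P(1, -1)
                    + P(-1, 1) + 2 * P(0, 1) + P(1, 1)); /* vertical derivative   */
#undef P
}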
- * @param[in] left1_coeff_gx Weight of the most left pixel for gx - * @param[in] left2_coeff_gx Weight of the left pixel for gx - * @param[in] middle_coeff_gx Weight of the middle pixel for gx - * @param[in] right1_coeff_gx Weight of the right pixel for gx - * @param[in] right2_coeff_gx Weight of the most right pixel for gx - * @param[in] left1_coeff_gy Weight of the most left pixel for gy - * @param[in] left2_coeff_gy Weight of the left pixel for gy - * @param[in] middle_coeff_gy Weight of the middle pixel for gy - * @param[in] right1_coeff_gy Weight of the right pixel for gy - * @param[in] right2_coeff_gy Weight of the most right pixel for gy - * - * @return a short16 containing short8 gx and short8 gy values. - */ -short16 sobel1x5( - Image *src, - const short left1_coeff_gx, - const short left2_coeff_gx, - const short middle_coeff_gx, - const short right1_coeff_gx, - const short right2_coeff_gx, - const short left1_coeff_gy, - const short left2_coeff_gy, - const short middle_coeff_gy, - const short right1_coeff_gy, - const short right2_coeff_gy) -{ - uchar16 temp = vload16(0, offset(src, -2, 0)); - short8 gx = 0; - short8 gy = 0; - short8 val; - - val = convert_short8(temp.s01234567); - gx += val * (short8)left1_coeff_gx; - gy += val * (short8)left1_coeff_gy; - - val = convert_short8(temp.s12345678); - gx += val * (short8)left2_coeff_gx; - gy += val * (short8)left2_coeff_gy; - - val = convert_short8(temp.s23456789); - gx += val * (short8)middle_coeff_gx; - gy += val * (short8)middle_coeff_gy; - - val = convert_short8(temp.s3456789a); - gx += val * (short8)right1_coeff_gx; - gy += val * (short8)right1_coeff_gy; - - val = convert_short8(temp.s456789ab); - gx += val * (short8)right2_coeff_gx; - gy += val * (short8)right2_coeff_gy; - - return (short16)(gx, gy); -} - -/** Compute a 1D vertical sobel filter 5x1 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels). - * - * @param[in] src Pointer to source image. - * @param[in] up1_coeff Weight of the most up pixel - * @param[in] up2_coeff Weight of the up pixel - * @param[in] middle_coeff Weight of the middle pixel - * @param[in] down1_coeff Weight of the down pixel - * @param[in] down2_coeff Weight of the most down pixel - * - * @return a short8 containing 8 convoluted values. - */ -short8 sobel5x1( - Image *src, - const short up1_coeff, - const short up2_coeff, - const short middle_coeff, - const short down1_coeff, - const short down2_coeff) -{ - short8 val; - short8 out = (short8)0; - - val = vload8(0, (__global short *)offset(src, 0, -2)); - out += val * (short8)up1_coeff; - - val = vload8(0, (__global short *)offset(src, 0, -1)); - out += val * (short8)up2_coeff; - - val = vload8(0, (__global short *)offset(src, 0, 0)); - out += val * (short8)middle_coeff; - - val = vload8(0, (__global short *)offset(src, 0, 1)); - out += val * (short8)down1_coeff; - - val = vload8(0, (__global short *)offset(src, 0, 2)); - out += val * (short8)down2_coeff; - - return (short8)(out); -} - -/** Apply a 1x5 sobel matrix to a single channel U8 input image and output two temporary channel S16 images. - * - * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient - * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required. - * - * @param[in] src_ptr Pointer to the source image.. 
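The two helpers above work because each 5x5 Sobel kernel is separable: it is the outer product of the binomial smoothing vector {1, 4, 6, 4, 1} and the derivative vector {-1, -2, 0, 2, 1} that the callers pass as coefficients. A standalone check (illustrative, not library code) that prints the resulting 5x5 Gx weights:

#include <stdio.h>

/* Prints the full 5x5 Sobel Gx kernel as the outer product of the factors
 * applied by sobel1x5() (horizontal) and sobel5x1() (vertical) above. */
int main(void)
{
    const int deriv[5]  = { -1, -2, 0, 2, 1 }; /* horizontal pass, Gx */
    const int smooth[5] = {  1,  4, 6, 4, 1 }; /* vertical pass, Gx   */

    for (int r = 0; r < 5; ++r)
    {
        for (int c = 0; c < 5; ++c)
            printf("%4d", smooth[r] * deriv[c]);
        printf("\n");
    }
    return 0;
}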
Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_gx_ptr Pointer to the destination image.. Supported data types: S16 - * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16 - * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void sobel_separable1x5( - IMAGE_DECLARATION(src) -#ifdef GRAD_X - , - IMAGE_DECLARATION(dst_gx) -#endif /* GRAD_X */ -#ifdef GRAD_Y - , - IMAGE_DECLARATION(dst_gy) -#endif /* GRAD_Y */ -) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); -#ifdef GRAD_X - Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx); -#endif /* GRAD_X */ -#ifdef GRAD_Y - Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy); -#endif /* GRAD_Y */ - - // Output pixels - short16 gx_gy = sobel1x5(&src, - -1, -2, 0, 2, 1, - 1, 4, 6, 4, 1); - - // Store result in dst -#ifdef GRAD_X - vstore8(gx_gy.s01234567, 0, ((__global short *)dst_gx.ptr)); -#endif /* GRAD_X */ -#ifdef GRAD_Y - vstore8(gx_gy.s89ABCDEF, 0, ((__global short *)dst_gy.ptr)); -#endif /* GRAD_Y */ -} - -/** Apply a 5x1 convolution matrix to two single channel S16 input temporary images - * and output two single channel S16 images. - * - * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient - * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required. - * - * @param[in] src_x_ptr Pointer to the source image.. Supported data types: S16 - * @param[in] src_x_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_x_step_x src_x_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_x_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_x_step_y src_x_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_x_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_gx_ptr Pointer to the destination image. 
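Since the gradient selection is a compile-time choice, a host would typically rebuild the program with the matching options before creating the kernel. A hedged sketch using the standard OpenCL host API; the function name, the pre-created program object, and the omitted error handling are assumptions for illustration:

#include <CL/cl.h>

/* Sketch: gradient selection is baked in at build time via -DGRAD_X /
 * -DGRAD_Y, which also changes the kernel's argument list. 'program' is
 * assumed to already wrap the sobel_filter.cl source. */
static cl_kernel build_sobel5x5_pass1(cl_program program, cl_device_id dev,
                                      int want_gx, int want_gy)
{
    const char *opts = (want_gx && want_gy) ? "-DGRAD_X -DGRAD_Y"
                       : want_gx            ? "-DGRAD_X"
                                            : "-DGRAD_Y";
    clBuildProgram(program, 1, &dev, opts, NULL, NULL);
    return clCreateKernel(program, "sobel_separable1x5", NULL);
}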
Supported data types: S16 - * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] src_y_ptr Pointer to the source image. Supported data types: S16 - * @param[in] src_y_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_y_step_x src_y_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_y_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_y_step_y src_y_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_y_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16 - * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] dummy Dummy parameter to easy conditional inclusion - */ -__kernel void sobel_separable5x1( -#ifdef GRAD_X - IMAGE_DECLARATION(src_x), - IMAGE_DECLARATION(dst_gx), -#endif /* GRAD_X */ -#ifdef GRAD_Y - IMAGE_DECLARATION(src_y), - IMAGE_DECLARATION(dst_gy), -#endif /* GRAD_Y */ - int dummy) -{ -#ifdef GRAD_X - Image src_x = CONVERT_TO_IMAGE_STRUCT(src_x); - Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx); -#endif /* GRAD_X */ -#ifdef GRAD_Y - Image src_y = CONVERT_TO_IMAGE_STRUCT(src_y); - Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy); -#endif /* GRAD_Y */ - -#ifdef GRAD_X - short8 gx = sobel5x1(&src_x, - 1, 4, 6, 4, 1); - vstore8(gx, 0, ((__global short *)dst_gx.ptr)); -#endif /* GRAD_X */ -#ifdef GRAD_Y - short8 gy = sobel5x1(&src_y, - -1, -2, 0, 2, 1); - vstore8(gy, 0, ((__global short *)dst_gy.ptr)); -#endif /* GRAD_Y */ -} - -/**********************************************/ -/* End implementation of Sobel5x5 filter */ -/**********************************************/ - -/***********************************************/ -/* Begin implementation of Sobel7x7 filter */ -/***********************************************/ - -/* Sobel 1x7 horizontal X / 7x1 vertical Y coefficients */ -#define X0 -1 -#define X1 -4 -#define X2 -5 -#define X3 0 -#define X4 5 -#define X5 4 -#define X6 1 - -/* Sobel 1x7 vertical X / 7x1 horizontal Y coefficients */ -#define Y0 1 -#define Y1 6 -#define Y2 15 -#define Y3 20 -#define Y4 15 -#define Y5 6 -#define Y6 1 - -/* Calculates single horizontal iteration. */ -#define SOBEL1x1_HOR(src, gx, gy, idx) \ - { \ - int8 val = convert_int8(vload8(0, offset(src, idx - 3, 0))); \ - gx += val * X##idx; \ - gy += val * Y##idx; \ - } - -/* Calculates single vertical iteration. 
*/ -#define SOBEL1x1_VERT(src, g, direction, idx) \ - { \ - int8 val = vload8(0, (__global int *)offset(src, 0, idx - 3)); \ - g += val * (int8)direction##idx; \ - } - -/* Calculates a 1x7 horizontal iteration. */ -#define SOBEL1x7(ptr, gx, gy) \ - SOBEL1x1_HOR(ptr, gx, gy, 0) \ - SOBEL1x1_HOR(ptr, gx, gy, 1) \ - SOBEL1x1_HOR(ptr, gx, gy, 2) \ - SOBEL1x1_HOR(ptr, gx, gy, 3) \ - SOBEL1x1_HOR(ptr, gx, gy, 4) \ - SOBEL1x1_HOR(ptr, gx, gy, 5) \ - SOBEL1x1_HOR(ptr, gx, gy, 6) - -/* Calculates a 7x1 vertical iteration. */ -#define SOBEL7x1(ptr, g, direction) \ - SOBEL1x1_VERT(ptr, g, direction, 0) \ - SOBEL1x1_VERT(ptr, g, direction, 1) \ - SOBEL1x1_VERT(ptr, g, direction, 2) \ - SOBEL1x1_VERT(ptr, g, direction, 3) \ - SOBEL1x1_VERT(ptr, g, direction, 4) \ - SOBEL1x1_VERT(ptr, g, direction, 5) \ - SOBEL1x1_VERT(ptr, g, direction, 6) - -/** Apply a 1x7 sobel matrix to a single channel U8 input image and output two temporary channel S16 images and leave the borders undefined. - * - * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient - * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S32 - * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[out] dst_gy_ptr Pointer to the destination image. 
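The X0..X6 and Y0..Y6 coefficients above are the separable factors of the 7x7 Sobel kernels: the full Gx matrix is the outer product of the binomial smoothing column {1, 6, 15, 20, 15, 6, 1} and the derivative row {-1, -4, -5, 0, 5, 4, 1}, with the two roles swapped for Gy. A standalone check (illustrative, not library code) that prints the 7x7 Gx weights:

#include <stdio.h>

/* Prints the full 7x7 Sobel Gx kernel as the outer product of the
 * Y0..Y6 smoothing column and the X0..X6 derivative row defined above. */
int main(void)
{
    const int xk[7] = { -1, -4, -5, 0, 5, 4, 1 };   /* X0..X6 */
    const int yk[7] = {  1,  6, 15, 20, 15, 6, 1 }; /* Y0..Y6 */

    for (int r = 0; r < 7; ++r)
    {
        for (int c = 0; c < 7; ++c)
            printf("%5d", yk[r] * xk[c]);
        printf("\n");
    }
    return 0;
}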
Supported data types: S32 - * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void sobel_separable1x7( - IMAGE_DECLARATION(src) -#ifdef GRAD_X - , - IMAGE_DECLARATION(dst_gx) -#endif /* GRAD_X */ -#ifdef GRAD_Y - , - IMAGE_DECLARATION(dst_gy) -#endif /* GRAD_Y */ -) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); -#ifdef GRAD_X - Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx); -#endif /* GRAD_X */ -#ifdef GRAD_Y - Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy); -#endif /* GRAD_Y */ - int8 gx = (int8)0; - int8 gy = (int8)0; - - SOBEL1x7(&src, gx, gy); - - // Store result in dst -#ifdef GRAD_X - vstore8(gx, 0, ((__global int *)dst_gx.ptr)); -#endif /* GRAD_X */ -#ifdef GRAD_Y - vstore8(gy, 0, ((__global int *)dst_gy.ptr)); -#endif /* GRAD_Y */ -} - -/** Apply a 7x1 convolution matrix to two single channel S16 input temporary images and output two single channel S16 images and leave the borders undefined. - * - * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient - * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required. - * - * @param[in] src_x_ptr Pointer to the source image. Supported data types: S32 - * @param[in] src_x_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_x_step_x src_x_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_x_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_x_step_y src_x_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_x_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16 - * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] src_y_ptr Pointer to the source image. Supported data types: S32 - * @param[in] src_y_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_y_step_x src_y_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_y_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_y_step_y src_y_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_y_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_gy_ptr Pointer to the destination image. 
Supported data types: S16 - * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] dummy Dummy parameter to easy conditional inclusion - */ -__kernel void sobel_separable7x1( -#ifdef GRAD_X - IMAGE_DECLARATION(src_x), - IMAGE_DECLARATION(dst_gx), -#endif /* GRAD_X */ -#ifdef GRAD_Y - IMAGE_DECLARATION(src_y), - IMAGE_DECLARATION(dst_gy), -#endif /* GRAD_Y */ - int dummy) -{ -#ifdef GRAD_X - Image src_x = CONVERT_TO_IMAGE_STRUCT(src_x); - Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx); -#endif /* GRAD_X */ -#ifdef GRAD_Y - Image src_y = CONVERT_TO_IMAGE_STRUCT(src_y); - Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy); -#endif /* GRAD_Y */ - - // Output pixels -#ifdef GRAD_X - int8 gx = 0; - SOBEL7x1(&src_x, gx, Y); - vstore8(gx, 0, (__global int *)dst_gx.ptr); -#endif /* GRAD_X */ -#ifdef GRAD_Y - int8 gy = 0; - SOBEL7x1(&src_y, gy, X); - vstore8(gy, 0, (__global int *)dst_gy.ptr); -#endif /* GRAD_Y */ -} - -/**********************************************/ -/* End implementation of Sobel7x7 filter */ -/**********************************************/ diff --git a/src/core/CL/cl_kernels/softmax_layer.cl b/src/core/CL/cl_kernels/softmax_layer.cl deleted file mode 100644 index 4d2d89dd73..0000000000 --- a/src/core/CL/cl_kernels/softmax_layer.cl +++ /dev/null @@ -1,531 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) - -/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel. - * - * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float - * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0 - * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. 
-DVECTOR_SIZE=16 - * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE - * @note In case of log softmax, -DLOG_SOFTMAX must be passed. - * - * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr - * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes) - * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes) - * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes) - * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor - * @param[out] dst_ptr Pointer to the destination tensor slice. 
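The normalization step documented above reduces to one division (or one log subtraction) per element. A scalar sketch under stated assumptions: row holds the values written by the previous pass, shifted logits when -DLOG_SOFTMAX is set and their exponentials otherwise, and softmax_norm_ref is an illustrative name:

#include <math.h>
#include <stddef.h>

/* Scalar sketch of the normalization step: divide by the row sum, or
 * subtract log(sum) in the log-softmax variant. */
static void softmax_norm_ref(float *row, size_t n, float sum, int log_softmax)
{
    if (log_softmax)
    {
        const float log_sum = logf(sum);
        for (size_t i = 0; i < n; ++i)
            row[i] -= log_sum; /* log-softmax: (x - max) - log(sum) */
    }
    else
    {
        for (size_t i = 0; i < n; ++i)
            row[i] /= sum;     /* softmax: exp(x - max) / sum       */
    }
}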
Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
- */
-__kernel void softmax_layer_norm(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(sum),
-    TENSOR3D_DECLARATION(dst))
-{
-    const int x_offs = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VECTOR_SIZE_LEFTOVER) % VECTOR_SIZE), 0) * sizeof(DATA_TYPE);
-
-    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
-
-    Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum);
-
-    // Load the sum of the 1D logits vector (row), computed by the previous pass
-    DATA_TYPE sum_val = *((__global DATA_TYPE *)offset(&sum, 0, get_global_id(1)));
-    VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-    data0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr);
-
-#if defined(LOG_SOFTMAX)
-    sum_val = log(sum_val);
-    data0 -= sum_val;
-#else  // defined(LOG_SOFTMAX)
-    data0 /= sum_val;
-#endif // defined(LOG_SOFTMAX)
-
-    STORE_VECTOR_SELECT(data, DATA_TYPE, dst_addr, VECTOR_SIZE, VECTOR_SIZE_LEFTOVER, VECTOR_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
-
-#if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) && defined(MINVAL)
-
-/* Number of workitems in dimension 0. */
-#if !defined(GRID_SIZE)
-#define GRID_SIZE 1
-#endif /* !defined(GRID_SIZE) */
-
-#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-#define SELECT_TYPE SELECT_VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-
-/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value,
- * then computes the exponential of each element and sums all elements across each row.
- *
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float
- * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0
- * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
- * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
- * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
- * @note In case of log softmax, -DLOG_SOFTMAX must be passed.
- * @note Based on the data type, the minimum possible value must be passed using -DMINVAL. For float it should be defined as -FLT_MAX, while for half it should be -HALF_MAX
- *
- * @param[in]  src_ptr                            Pointer to the source tensor slice. Supported data types: F16/F32
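The pass summarised above boils down to three row traversals: maximum, shifted exponential, and sum. A scalar reference sketch; the function name and the beta parameter standing in for -DBETA are illustrative assumptions:

#include <math.h>
#include <stddef.h>

/* Scalar sketch of the max/shift/exp/sum pass: find the row maximum, shift
 * by it for numeric stability, optionally scale by beta, exponentiate and
 * accumulate the sum. */
static float softmax_shift_exp_sum_ref(const float *src, float *dst,
                                       size_t n, float beta, float *max_out)
{
    float m = -INFINITY;
    for (size_t i = 0; i < n; ++i)
        m = fmaxf(m, src[i]);

    float sum = 0.0f;
    for (size_t i = 0; i < n; ++i)
    {
        dst[i] = expf((src[i] - m) * beta);
        sum += dst[i];
    }
    *max_out = m;
    return sum;
}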
- * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
- * @param[in]  maxo_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
- * @param[in]  maxo_stride_x                      Stride of the max values tensor in X dimension (in bytes)
- * @param[in]  maxo_step_x                        maxo_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  maxo_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
- * @param[in]  maxo_step_y                        maxo_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  maxo_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
- * @param[in]  maxo_step_z                        maxo_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
- * @param[out] dst_ptr                            Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
- * @param[out] sum_ptr                            Pointer to the sum values tensor slice.
Supported data types: same as @p src_ptr - * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes) - * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes) - * @param[in] sum_step_y sum_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes) - * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor - */ -__kernel void softmax_layer_max_shift_exp_sum_serial( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(maxo), - TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(sum)) -{ - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z; - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z; - - Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo); - Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum); - -#ifdef BETA - // Initialize beta - VEC_TYPE beta = (VEC_TYPE)BETA; -#endif /* BETA */ - - // Initialize local maximum - VEC_TYPE max_val_vec = (VEC_TYPE)(MINVAL); - -#ifdef NON_MULTIPLE_OF_VECTOR_SIZE - VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr); - SELECT_TYPE widx = (SELECT_TYPE)VECTOR_SIZE_LEFTOVER > VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VECTOR_SIZE); - max_val_vec = max(max_val_vec, select((VEC_TYPE)(MINVAL), data, widx)); -#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ - - for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE) - { - VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE))); - max_val_vec = max(data, max_val_vec); - } - - // Perform max reduction - DATA_TYPE max_val = MAX_REDUCE(max_val_vec, VECTOR_SIZE); - *((__global DATA_TYPE *)maxo.ptr) = max_val; - - /* Second section */ - - // Set sum vector - VEC_TYPE sum1D = 0; - -#ifdef NON_MULTIPLE_OF_VECTOR_SIZE - data -= max_val; -#ifdef BETA - data *= beta; -#endif /* BETA */ -#ifdef LOG_SOFTMAX - VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER) - (data, 0, (__global DATA_TYPE *)dst_addr); - data = exp(data); - data = select(0, data, widx); -#else /* LOG_SOFTMAX */ - data = exp(data); - data = select(0, data, widx); - VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER) - (data, 0, (__global DATA_TYPE *)dst_addr); -#endif /* LOG_SOFTMAX */ - sum1D += data; -#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ - - // Shift values, exp and sum - for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE) - { - VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE))); - data -= max_val; -#ifdef BETA - data *= beta; -#endif /* BETA */ -#ifdef LOG_SOFTMAX - VSTORE(VECTOR_SIZE) - (data, 0, (__global DATA_TYPE *)(dst_addr + i * sizeof(DATA_TYPE))); - data = exp(data); -#else /* LOG_SOFTMAX */ - data = exp(data); - VSTORE(VECTOR_SIZE) - (data, 0, (__global DATA_TYPE *)(dst_addr + i * sizeof(DATA_TYPE))); -#endif /* LOG_SOFTMAX */ - sum1D += data; - } - - // Perform sum reduction - *((__global DATA_TYPE *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE); -} - -/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum 
value,
- * then computes the exponential of each element and sums all elements across each row.
- *
- * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=float
- * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=0
- * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16
- * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE
- * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16) -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed.
- * @note Beta can be optionally passed at compile time using -DBETA (by default, it is 1.0).
- * @note In case of log softmax, -DLOG_SOFTMAX must be passed.
- * @note Based on the data type, the minimum possible value must be passed using -DMINVAL. For float it should be defined as -FLT_MAX, while for half it should be -HALF_MAX
- *
- * @param[in]  src_ptr                            Pointer to the source tensor slice. Supported data types: F16/F32
- * @param[in]  src_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                         src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes  The offset of the first element in the source tensor
- * @param[in]  maxo_ptr                           Pointer to the max values tensor slice. Supported data types: same as @p src_ptr
- * @param[in]  maxo_stride_x                      Stride of the max values tensor in X dimension (in bytes)
- * @param[in]  maxo_step_x                        maxo_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  maxo_stride_y                      Stride of the max values tensor in Y dimension (in bytes)
- * @param[in]  maxo_step_y                        maxo_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  maxo_stride_z                      Stride of the max values tensor in Z dimension (in bytes)
- * @param[in]  maxo_step_z                        maxo_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor
- * @param[out] dst_ptr                            Pointer to the destination tensor slice. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                       Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                         dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination tensor
- * @param[out] sum_ptr                            Pointer to the sum values tensor slice.
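The parallel kernel that follows combines per-work-item partial results with a barrier-synchronised halving scheme (the chain of GRID_SIZE >= 256 down to >= 4 blocks in its body). The same reduction shape in serial C, assuming a power-of-two grid_size; tree_reduce_max is an illustrative name:

/* Serial sketch of the device-side tree reduction: halve the active range
 * each step, with a barrier between steps on the device. */
static float tree_reduce_max(float *tmp, int grid_size)
{
    if (grid_size < 2)
        return tmp[0];
    for (int stride = grid_size / 2; stride > 1; stride /= 2)
    {
        for (int lid = 0; lid < stride; ++lid) /* all "work-items" in lockstep */
            tmp[lid] = tmp[lid] > tmp[lid + stride] ? tmp[lid] : tmp[lid + stride];
        /* barrier(CLK_LOCAL_MEM_FENCE) would sit here on the device */
    }
    return tmp[0] > tmp[1] ? tmp[0] : tmp[1];  /* the lid == 0 work-item finishes */
}

The kernel runs the same scheme twice, once for the row maximum and once for the row sum of exponentials.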
Supported data types: same as @p src_ptr - * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes) - * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes) - * @param[in] sum_step_y sum_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes) - * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor - */ -__kernel void softmax_layer_max_shift_exp_sum_parallel( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(maxo), - TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(sum)) -{ - const uint lid = get_local_id(0); - const uint x_offs = (VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE) * sizeof(DATA_TYPE); - - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z; - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z; - - Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo); - Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum); - -#ifdef BETA - // Initialize beta - VEC_TYPE beta = (VEC_TYPE)BETA; -#endif /* BETA */ - - // Define one temporary vector per work-item. - __local VEC_TYPE tmp_local[GRID_SIZE]; - __local DATA_TYPE max_local; - - VEC_TYPE max_val_vec = (VEC_TYPE)(MINVAL); - - // Number of iterations per work-item. - const uint width = (SRC_WIDTH / GRID_SIZE) >> LOG_VECTOR_SIZE; - // Calculate max of row - uint i = 0; - for(; i < width; ++i) - { - VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); - max_val_vec = max(data_max, max_val_vec); - } -#ifdef NON_MULTIPLE_OF_GRID_SIZE - // How many work-items needed to complete the computation. 
- int boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE; - if(lid < boundary_workitems) - { - VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); - max_val_vec = max(data_max, max_val_vec); - } -#ifdef NON_MULTIPLE_OF_VECTOR_SIZE - SELECT_TYPE widx; - if(lid == 0) - { - // Handle non multiple of 4 - VEC_TYPE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE))); - widx = (SELECT_TYPE)VECTOR_SIZE_LEFTOVER > VEC_OFFS(SELECT_DATA_TYPE(DATA_TYPE), VECTOR_SIZE); - max_val_vec = max(max_val_vec, select((VEC_TYPE)(MINVAL), data_max, widx)); - } -#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ -#endif /* NON_MULTIPLE_OF_GRID_SIZE */ - tmp_local[lid] = max_val_vec; - - barrier(CLK_LOCAL_MEM_FENCE); - - if(GRID_SIZE >= 256) - { - if(lid < 128) - { - tmp_local[lid] = max(tmp_local[lid + 128], tmp_local[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 128) - { - if(lid < 64) - { - tmp_local[lid] = max(tmp_local[lid + 64], tmp_local[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 64) - { - if(lid < 32) - { - tmp_local[lid] = max(tmp_local[lid + 32], tmp_local[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 32) - { - if(lid < 16) - { - tmp_local[lid] = max(tmp_local[lid + 16], tmp_local[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 16) - { - if(lid < 8) - { - tmp_local[lid] = max(tmp_local[lid + 8], tmp_local[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 8) - { - if(lid < 4) - { - tmp_local[lid] = max(tmp_local[lid + 4], tmp_local[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 4) - { - if(lid < 2) - { - tmp_local[lid] = max(tmp_local[lid + 2], tmp_local[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(lid == 0) - { - max_val_vec = max(tmp_local[lid + 1], tmp_local[lid]); - max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE); - } - barrier(CLK_LOCAL_MEM_FENCE); - - /* Second section */ - - // Set sum vector - VEC_TYPE sum1D = 0; - DATA_TYPE max_val = max_local; - - // Shift values, exp and sum - for(i = 0; i < width; ++i) - { - VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); - data -= max_val; -#ifdef BETA - data *= beta; -#endif /* BETA */ -#ifdef LOG_SOFTMAX - VSTORE(VECTOR_SIZE) - (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); - data = exp(data); -#else /* LOG_SOFTMAX */ - data = exp(data); - VSTORE(VECTOR_SIZE) - (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); -#endif /* LOG_SOFTMAX */ - sum1D += data; - } -#ifdef NON_MULTIPLE_OF_GRID_SIZE - boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE; - if(lid < boundary_workitems) - { - VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); - data -= max_val; -#ifdef BETA - data *= beta; -#endif /* BETA */ -#ifdef LOG_SOFTMAX - VSTORE(VECTOR_SIZE) - (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); - data = exp(data); -#else /* LOG_SOFTMAX */ - data = exp(data); - VSTORE(VECTOR_SIZE) - (data, 0, (__global DATA_TYPE *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); -#endif /* LOG_SOFTMAX */ - sum1D += data; - } -#ifdef NON_MULTIPLE_OF_VECTOR_SIZE - if(lid == 
0) - { - // Handle non multiple of vector size ((GRID_SIZE * i * 4) + 4, 0); move 4 float positions ahead, *4 is due to the stride - VEC_TYPE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE))); - data -= max_val; -#ifdef BETA - data *= beta; -#endif /* BETA */ -#ifdef LOG_SOFTMAX - VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER) - (data, 0, (__global DATA_TYPE *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE))); - data = exp(data); - data = select(0, data, widx); -#else /* LOG_SOFTMAX */ - data = exp(data); - data = select(0, data, widx); - VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER) - (data, 0, (__global DATA_TYPE *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE))); -#endif /* LOG_SOFTMAX */ - sum1D += data; - } -#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ -#endif /* NON_MULTIPLE_OF_GRID_SIZE */ - tmp_local[lid] = sum1D; - - barrier(CLK_LOCAL_MEM_FENCE); - - if(GRID_SIZE >= 256) - { - if(lid < 128) - { - tmp_local[lid] += tmp_local[lid + 128]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 128) - { - if(lid < 64) - { - tmp_local[lid] += tmp_local[lid + 64]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 64) - { - if(lid < 32) - { - tmp_local[lid] += tmp_local[lid + 32]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 32) - { - if(lid < 16) - { - tmp_local[lid] += tmp_local[lid + 16]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 16) - { - if(lid < 8) - { - tmp_local[lid] += tmp_local[lid + 8]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 8) - { - if(lid < 4) - { - tmp_local[lid] += tmp_local[lid + 4]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 4) - { - if(lid < 2) - { - tmp_local[lid] += tmp_local[lid + 2]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(lid == 0) - { - sum1D = (tmp_local[lid + 1] + tmp_local[lid]); - // Perform sum reduction - *((__global DATA_TYPE *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE); - } -} - -#endif // defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) && defined(MINVAL) -#endif // defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/softmax_layer_quantized.cl b/src/core/CL/cl_kernels/softmax_layer_quantized.cl deleted file mode 100644 index 4d5006d804..0000000000 --- a/src/core/CL/cl_kernels/softmax_layer_quantized.cl +++ /dev/null @@ -1,530 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers_asymm.h" - -#if defined(DATA_TYPE) && defined(MIN_VALUE) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) && defined(DIFF_MIN) - -#define VEC_BASE VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) -#define VEC_INT VEC_DATA_TYPE(int, VECTOR_SIZE) - -/** Divides all the values of the input tensor by the sum calculated from softmax_layer_shift_exp_sum kernel. - * - * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar - * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128 - * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16 - * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VECTOR_SIZE - * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0) - * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS. - * @note -DDIFF_MIN must be passed at compile time. It is threshold difference between maximum value of input data and current processed value, it defines whether the value will be taken into account or not. - * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed. - * - * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: S32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p src_ptr - * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes) - * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes) - * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes) - * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor - * @param[out] dst_ptr Pointer to the destination tensor slice. 
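Before dividing by the sum, the quantized normalization kernel below renormalises the accumulated sum into Q0.31 fixed point using its leading-zero headroom, and keeps the number of dropped integer bits for the final rescale shift. A plain-C sketch of that setup; __builtin_clz (a GCC/Clang builtin) stands in for OpenCL's clz(), sum is assumed non-zero, and the names are illustrative:

#include <stdint.h>

/* Shift the sum so its top bit is set (a Q0.31 value in [0.5, 1.0)), then
 * drop the implicit integer bit; exp_accum_int_bits mirrors
 * -DEXP_ACCUMULATION_INT_BITS. */
static int32_t normalize_sum_q31(uint32_t sum, int exp_accum_int_bits,
                                 int *num_bits_over_unit)
{
    const int headroom_plus_one = __builtin_clz(sum); /* sum must be non-zero */
    *num_bits_over_unit = exp_accum_int_bits - headroom_plus_one;
    return (int32_t)((sum << headroom_plus_one) - (1u << 31));
}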
Supported data types: QASYMM8/QASYMM8_SIGNED - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void softmax_layer_norm_quantized( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(sum), - TENSOR3D_DECLARATION(dst)) -{ - const int x_offs = max((int)(get_global_id(0) * VECTOR_SIZE - (VECTOR_SIZE - VECTOR_SIZE_LEFTOVER) % VECTOR_SIZE), 0); - - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(int) + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z; - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z; - - Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(sum); - - // Load the sum value of the 1D logits vector (row) - int sum_val = *((__global int *)offset(&sum, 0, get_global_id(1))); - - // It would be better to calculate this in the previous layer and pass it here as a parameter - uint sum_val_u = convert_uint(sum_val); - int headroom_plus_one = clz(sum_val_u); - int num_bits_over_unit = EXP_ACCUMULATION_INT_BITS - headroom_plus_one; - int shifted_sum_minus_one_1 = convert_int((sum_val_u << headroom_plus_one) - (1u << 31)); - VEC_INT shifted_sum_minus_one = shifted_sum_minus_one_1; - VEC_INT shifted_scale = ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(shifted_sum_minus_one, VECTOR_SIZE); - - // This was already calculated in the previous layer; it should be stored in a temporary output and reused - VEC_INT data_diff = VLOAD(VECTOR_SIZE)(0, (__global int *)src_addr); - VEC_INT data_diff_mult = data_diff; -#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) - if(INPUT_BETA_MULTIPLIER > 1) - { - data_diff_mult = ASYMM_MULT(data_diff * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, VECTOR_SIZE); - } -#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */ - - VEC_INT data = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE); - data = ASYMM_MULT(shifted_scale, data, VECTOR_SIZE); - data = ASYMM_ROUNDING_DIVIDE_BY_POW2(data, num_bits_over_unit + 31 - 8, VECTOR_SIZE); -#ifdef QASYMM8_SIGNED - data += (VEC_INT)(MIN_VALUE); -#endif /* QASYMM8_SIGNED */ - data = select(MIN_VALUE, data, data_diff >= (VEC_INT)(DIFF_MIN)); - VEC_BASE data0 = CONVERT_SAT(data, VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)); - - STORE_VECTOR_SELECT(data, DATA_TYPE, dst_addr, VECTOR_SIZE, VECTOR_SIZE_LEFTOVER, VECTOR_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) -} - -#if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) - -/* Number of workitems in dimension 0.
*/ -#if !defined(GRID_SIZE) -#define GRID_SIZE 1 -#endif /* !defined(GRID_SIZE) */ - -#define VEC_UINT VEC_DATA_TYPE(uint, VECTOR_SIZE) - -VEC_INT mult_by_quantized_multiplier(VEC_INT data) -{ -#if defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) - if(INPUT_BETA_MULTIPLIER > 1) - { - return ASYMM_MULT(data * (1 << INPUT_BETA_LEFT_SHIFT), INPUT_BETA_MULTIPLIER, VECTOR_SIZE); - } -#endif /* defined(INPUT_BETA_MULTIPLIER) && defined(INPUT_BETA_LEFT_SHIFT) */ - return data; -} - -/** Shifts the values of the input tensor by the max calculated in softmax_layer_max kernel, - * then computes the exponential of each element and sums all elements across each row. - * - * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar - * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128 - * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16 - * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VECTOR_SIZE - * @note In case the input is not a multiple of VECTOR_SIZE, -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed. - * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0) - * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS. - * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the currently processed value; it defines whether the value will be taken into account or not. - * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed. - * - * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] maxo_ptr Pointer to the max values tensor slice. Supported data types: same as @p src_ptr - * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes) - * @param[in] maxo_step_x maxo_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes) - * @param[in] maxo_step_y maxo_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes) - * @param[in] maxo_step_z maxo_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor - * @param[out] dst_ptr Pointer to the destination tensor slice.
Supported data types: S32 - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr - * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes) - * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes) - * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes) - * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor - */ -__kernel void softmax_layer_max_shift_exp_sum_quantized_serial( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(maxo), - TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(sum)) -{ - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z; - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z; - - Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo); - Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum); - - VEC_BASE max_val_vec = (VEC_BASE)(MIN_VALUE); - - // Calculate max of row -#ifdef NON_MULTIPLE_OF_VECTOR_SIZE - VEC_BASE vec_min_val = (VEC_BASE)(MIN_VALUE); - VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)src_addr); - VEC_INT widx = (VEC_INT)VECTOR_SIZE_LEFTOVER > VEC_OFFS(int, VECTOR_SIZE); - max_val_vec = max(max_val_vec, select(vec_min_val, data, CONVERT(widx, VEC_BASE))); -#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ - - for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE) - { - VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE))); - max_val_vec = max(data, max_val_vec); - } - - // Perform max reduction - DATA_TYPE max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE); - *((__global DATA_TYPE *)maxo.ptr) = max_local; - - // Second part - - // Load max value of 1D logits vector (row) - int max_val = convert_int(max_local); - - // Set sum vector, Q(EXP_ACCUMULATION_INT_BITS) - VEC_INT sum1D = 0; - -#ifdef NON_MULTIPLE_OF_VECTOR_SIZE - VEC_INT data_fp = CONVERT(data, VEC_INT); - VEC_INT data_diff = data_fp - max_val; - VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff); - data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE); - data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE); - VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER) - (data_diff, 0, (__global int *)dst_addr); - data_fp = select(0, data_fp, data_diff >=
(VEC_INT)(DIFF_MIN)); - sum1D += select(0, data_fp, widx); -#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ - - // Shift values, exp and sum - for(uint i = VECTOR_SIZE_LEFTOVER; i < SRC_WIDTH; i += VECTOR_SIZE) - { - VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + i * sizeof(DATA_TYPE))); - VEC_INT data_fp = CONVERT(data, VEC_INT); - VEC_INT data_diff = data_fp - max_val; - VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff); - data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE); - data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE); - VSTORE(VECTOR_SIZE) - (data_diff, 0, (__global int *)(dst_addr + i * sizeof(int))); - sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN)); - } - - // Perform sum reduction - *((__global int *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE); -} - -/** Identifies the maximum value across the 1st dimension and shifts the values of the input tensor by this maximum value, - * then computes the exponential of each element and sums all elements across each row. - * - * @note Datatype must be given as a preprocessor argument using -DDATA_TYPE, e.g. -DDATA_TYPE=uchar - * @note The zero value for the given data type must be given as a preprocessor argument using -DMIN_VALUE, e.g. -DMIN_VALUE=-128 - * @note Vector size should be given as a preprocessor argument using -DVECTOR_SIZE=size. e.g. -DVECTOR_SIZE=16 - * @note Leftover vector size has to be passed at compile time using -DVECTOR_SIZE_LEFTOVER. e.g. -DVECTOR_SIZE_LEFTOVER=3. It is defined as the remainder of the input's first dimension divided by VECTOR_SIZE - * @note In case the input is not a multiple of VECTOR_SIZE (2,4,8,16), -DNON_MULTIPLE_OF_VECTOR_SIZE must be passed. - * @note Quantized beta can be optionally passed at compile time using -DINPUT_BETA_MULTIPLIER and -DINPUT_BETA_LEFT_SHIFT (if undefined, assume beta equals 1.0) - * @note Additional quantization data must be passed at compile time using -DSCALED_DIFF_INT_BITS and -DEXP_ACCUMULATION_INT_BITS. - * @note -DDIFF_MIN must be passed at compile time. It is the threshold difference between the maximum value of the input data and the currently processed value; it defines whether the value will be taken into account or not. - * @note In case the input's data type is QASYMM8_SIGNED, -DQASYMM8_SIGNED must be passed. - * - * @param[in] src_ptr Pointer to the source tensor slice. Supported data types: QASYMM8/QASYMM8_SIGNED - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[in] maxo_ptr Pointer to the max values tensor slice.
Supported data types: same as @p src_ptr - * @param[in] maxo_stride_x Stride of the max values tensor in X dimension (in bytes) - * @param[in] maxo_step_x maxo_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] maxo_stride_y Stride of the max values tensor in Y dimension (in bytes) - * @param[in] maxo_step_y maxo_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] maxo_stride_z Stride of the max values tensor in Z dimension (in bytes) - * @param[in] maxo_step_z maxo_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] maxo_offset_first_element_in_bytes The offset of the first element in the max values tensor - * @param[out] dst_ptr Pointer to the destination tensor slice. Supported data types: S32 - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[out] sum_ptr Pointer to the sum values tensor slice. Supported data types: same as @p dst_ptr - * @param[in] sum_stride_x Stride of the sum values tensor in X dimension (in bytes) - * @param[in] sum_step_x sum_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] sum_stride_y Stride of the sum values tensor in Y dimension (in bytes) - * @param[in] sum_step_y sum_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] sum_stride_z Stride of the sum values tensor in Z dimension (in bytes) - * @param[in] sum_step_z sum_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] sum_offset_first_element_in_bytes The offset of the first element in the sum values tensor - */ -__kernel void softmax_layer_max_shift_exp_sum_quantized_parallel( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(maxo), - TENSOR3D_DECLARATION(dst), - TENSOR3D_DECLARATION(sum)) -{ - const uint lid = get_local_id(0); - const uint x_offs = (VECTOR_SIZE_LEFTOVER + lid * VECTOR_SIZE); - - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z; - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(int) + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z; - - Image maxo = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(maxo); - Image sum = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(sum); - - // Define one temporary vector per work-item. - __local VEC_INT tmp_local[GRID_SIZE]; - __local DATA_TYPE max_local; - - VEC_BASE vec_min_val = (VEC_BASE)(MIN_VALUE); - VEC_BASE max_val_vec = vec_min_val; - - // Number of iterations per work-item.
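- // As a worked illustration of the partitioning below (example values only, an editorial sketch rather than part of the original kernel): with SRC_WIDTH = 1024, GRID_SIZE = 64 and VECTOR_SIZE = 16 (so LOG_VECTOR_SIZE = 4), each work-item performs width = (1024 / 64) >> 4 = 1 vectorized iteration, - // successive loads in the loop below advance by GRID_SIZE * VECTOR_SIZE = 1024 elements, and the GRID_SIZE work-items cooperatively cover one row before the tree reduction over tmp_local combines their partial maxima.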
- const uint width = (SRC_WIDTH / GRID_SIZE) >> LOG_VECTOR_SIZE; - // Calculate max of row - uint i = 0; - for(; i < width; ++i) - { - VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); - max_val_vec = max(data_max, max_val_vec); - } -#ifdef NON_MULTIPLE_OF_GRID_SIZE - // How many work-items needed to complete the computation. - int boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE; - if(lid < boundary_workitems) - { - VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); - max_val_vec = max(data_max, max_val_vec); - } -#ifdef NON_MULTIPLE_OF_VECTOR_SIZE - VEC_INT widx; - if(lid == 0) - { - // Handle non multiple of 4 - VEC_BASE data_max = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE))); - widx = (VEC_INT)VECTOR_SIZE_LEFTOVER > VEC_OFFS(int, VECTOR_SIZE); - max_val_vec = max(max_val_vec, select(vec_min_val, data_max, CONVERT(widx, VEC_BASE))); - } -#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ -#endif /* NON_MULTIPLE_OF_GRID_SIZE */ - tmp_local[lid] = CONVERT(max_val_vec, VEC_INT); - - barrier(CLK_LOCAL_MEM_FENCE); - - if(GRID_SIZE >= 256) - { - if(lid < 128) - { - tmp_local[lid] = max(tmp_local[lid + 128], tmp_local[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 128) - { - if(lid < 64) - { - tmp_local[lid] = max(tmp_local[lid + 64], tmp_local[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 64) - { - if(lid < 32) - { - tmp_local[lid] = max(tmp_local[lid + 32], tmp_local[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 32) - { - if(lid < 16) - { - tmp_local[lid] = max(tmp_local[lid + 16], tmp_local[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 16) - { - if(lid < 8) - { - tmp_local[lid] = max(tmp_local[lid + 8], tmp_local[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 8) - { - if(lid < 4) - { - tmp_local[lid] = max(tmp_local[lid + 4], tmp_local[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 4) - { - if(lid < 2) - { - tmp_local[lid] = max(tmp_local[lid + 2], tmp_local[lid]); - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(lid == 0) - { - max_val_vec = max(CONVERT((tmp_local[lid + 1]), VEC_BASE), CONVERT((tmp_local[lid]), VEC_BASE)); - max_local = MAX_REDUCE(max_val_vec, VECTOR_SIZE); - } - barrier(CLK_LOCAL_MEM_FENCE); - - /* Second section */ - - // Set sum vector - VEC_INT sum1D = 0; - int max_val = convert_int(max_local); - - // Shift values, exp and sum - for(i = 0; i < width; ++i) - { - VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(DATA_TYPE))); - VEC_INT data_fp = CONVERT(data, VEC_INT); - VEC_INT data_diff = data_fp - max_val; - VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff); - data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE); - data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE); - VSTORE(VECTOR_SIZE) - (data_diff, 0, (__global int *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(int))); - sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN)); - } -#ifdef NON_MULTIPLE_OF_GRID_SIZE - boundary_workitems = (SRC_WIDTH % (GRID_SIZE * VECTOR_SIZE)) / VECTOR_SIZE; - if(lid < boundary_workitems) - { - VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr + (i * GRID_SIZE * VECTOR_SIZE) * 
sizeof(DATA_TYPE))); - VEC_INT data_fp = CONVERT(data, VEC_INT); - VEC_INT data_diff = data_fp - max_val; - VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff); - data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE); - data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE); - VSTORE(VECTOR_SIZE) - (data_diff, 0, (__global int *)(dst_addr + (i * GRID_SIZE * VECTOR_SIZE) * sizeof(int))); - sum1D = sum1D + select(0, data_fp, data_diff >= (VEC_INT)(DIFF_MIN)); - } -#ifdef NON_MULTIPLE_OF_VECTOR_SIZE - if(lid == 0) - { - // Handle non multiple of vector size ((GRID_SIZE * i * 4) + 4, 0); move 4 float positions ahead, *4 is due to the stride - VEC_BASE data = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(src_addr - VECTOR_SIZE_LEFTOVER * sizeof(DATA_TYPE))); - VEC_INT data_fp = CONVERT(data, VEC_INT); - VEC_INT data_diff = data_fp - max_val; - VEC_INT data_diff_mult = mult_by_quantized_multiplier(data_diff); - data_fp = ASYMM_EXP_ON_NEGATIVE_VALUES(data_diff_mult, SCALED_DIFF_INT_BITS, VECTOR_SIZE); - data_fp = ASYMM_RESCALE(data_fp, 0, EXP_ACCUMULATION_INT_BITS, VECTOR_SIZE); - VSTORE_PARTIAL(VECTOR_SIZE, VECTOR_SIZE_LEFTOVER) - (data_diff, 0, (__global int *)(dst_addr - VECTOR_SIZE_LEFTOVER * sizeof(int))); - data_fp = select(MIN_VALUE, data_fp, data_diff >= (VEC_INT)(DIFF_MIN)); - data_fp = select(0, data_fp, widx); - sum1D = sum1D + data_fp; - } -#endif /* NON_MULTIPLE_OF_VECTOR_SIZE */ -#endif /* NON_MULTIPLE_OF_GRID_SIZE */ - tmp_local[lid] = sum1D; - - barrier(CLK_LOCAL_MEM_FENCE); - - if(GRID_SIZE >= 256) - { - if(lid < 128) - { - tmp_local[lid] += tmp_local[lid + 128]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 128) - { - if(lid < 64) - { - tmp_local[lid] += tmp_local[lid + 64]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 64) - { - if(lid < 32) - { - tmp_local[lid] += tmp_local[lid + 32]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 32) - { - if(lid < 16) - { - tmp_local[lid] += tmp_local[lid + 16]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 16) - { - if(lid < 8) - { - tmp_local[lid] += tmp_local[lid + 8]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 8) - { - if(lid < 4) - { - tmp_local[lid] += tmp_local[lid + 4]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(GRID_SIZE >= 4) - { - if(lid < 2) - { - tmp_local[lid] += tmp_local[lid + 2]; - } - barrier(CLK_LOCAL_MEM_FENCE); - } - if(lid == 0) - { - sum1D = (tmp_local[lid + 1] + tmp_local[lid]); - // Perform sum reduction - *((__global int *)sum.ptr) = SUM_REDUCE(sum1D, VECTOR_SIZE); - } -} -#endif // #if defined(SRC_WIDTH) && defined(LOG_VECTOR_SIZE) -#endif /* defined(DATA_TYPE) && defined(DIFF_MIN) && defined(VECTOR_SIZE) && defined(VECTOR_SIZE_LEFTOVER) && defined(MIN_VALUE) */ diff --git a/src/core/CL/cl_kernels/space_to_batch.cl b/src/core/CL/cl_kernels/space_to_batch.cl deleted file mode 100644 index cb11786ac4..0000000000 --- a/src/core/CL/cl_kernels/space_to_batch.cl +++ /dev/null @@ -1,280 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. 
- * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN) -/** Calculate the space to batch conversion. - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note The block shape tensor rank must be passed at compile time using -DBLOCK_SHAPE_DIM. e.g. -DBLOCK_SHAPE_DIM=2 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: All - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image - * @param[in] paddings_ptr Pointer to the second source image. Supported data types: S32 - * @param[in] paddings_stride_x Stride of the paddings tensor in X dimension (in bytes) - * @param[in] paddings_step_x paddings_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] paddings_stride_y Stride of the paddings tensor in Y dimension (in bytes) - * @param[in] paddings_step_y paddings_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] paddings_offset_first_element_in_bytes The offset of the first element in the second source image - * @param[in] block_shape_ptr Pointer to the block shape tensor. Supported data types: S32 - * @param[in] block_shape_stride_x Stride of the block shape tensor in X dimension (in bytes) - * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor - * @param[in] batch_id The output tensor batch id - * @param[out] output_ptr Pointer to the destination tensor.
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void space_to_batch_nchw( - TENSOR4D_DECLARATION(input), - IMAGE_DECLARATION(paddings), - VECTOR_DECLARATION(block_shape), - const int batch_id, - TENSOR3D_DECLARATION(output)) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Image pad = CONVERT_TO_IMAGE_STRUCT_NO_STEP(paddings); - Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); - - const int pad_left_x = *((__global int *)offset(&pad, 0, 0)); - const int pad_right_x = *((__global int *)offset(&pad, 1, 0)); - const int pad_left_y = *((__global int *)offset(&pad, 0, 1)); - const int pad_right_y = *((__global int *)offset(&pad, 1, 1)); - - int block_x = *((__global int *)vector_offset(&block, 0)); - int block_y = *((__global int *)vector_offset(&block, 1)); - - const int out_x = get_global_id(0); - const int out_y = get_global_id(1); - const int z = get_global_id(2); - - const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x); - const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x); - - if(((pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) && (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN))) - { - const int w = batch_id % BATCH_IN; - const int in_x = pos_x - pad_left_x; - const int in_y = pos_y - pad_left_y; - - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w)); - } -} -/** Calculate the space to batch conversion. (NHWC) - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note The block shape tensor rank must be passed at compile time using -DBLOCK_SHAPE_DIM. e.g. -DBLOCK_SHAPE_DIM=2 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: All - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image - * @param[in] paddings_ptr Pointer to the second source image. 
Supported data types: S32 - * @param[in] paddings_stride_x Stride of the paddings tensor in X dimension (in bytes) - * @param[in] paddings_step_x paddings_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] paddings_stride_y Stride of the paddings tensor in Y dimension (in bytes) - * @param[in] paddings_step_y paddings_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] paddings_offset_first_element_in_bytes The offset of the first element in the second source image - * @param[in] block_shape_ptr Pointer to the block shape tensor. Supported data types: S32 - * @param[in] block_shape_stride_x Stride of the block shape tensor in X dimension (in bytes) - * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shape tensor - * @param[in] batch_id The output tensor batch id - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void space_to_batch_nhwc( - TENSOR4D_DECLARATION(input), - IMAGE_DECLARATION(paddings), - VECTOR_DECLARATION(block_shape), - const int batch_id, - TENSOR3D_DECLARATION(output)) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Image pad = CONVERT_TO_IMAGE_STRUCT_NO_STEP(paddings); - Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); - - const int pad_left_x = *((__global int *)offset(&pad, 0, 0)); - const int pad_right_x = *((__global int *)offset(&pad, 1, 0)); - const int pad_left_y = *((__global int *)offset(&pad, 0, 1)); - const int pad_right_y = *((__global int *)offset(&pad, 1, 1)); - - int block_x = *((__global int *)vector_offset(&block, 0)); - int block_y = *((__global int *)vector_offset(&block, 1)); - - const int out_x = get_global_id(1); - const int out_y = get_global_id(2); - const int z = get_global_id(0); - - const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x); - const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x); - - if(((pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) && (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN))) - { - const int w = batch_id % BATCH_IN; - const int in_x = pos_x - pad_left_x; - const int in_y = pos_y - pad_left_y; - - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w)); - } -} -#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN) - -#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) &&
defined(WIDTH_IN) && defined(HEIGHT_IN) -/** Calculate the space to batch conversion. - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2 - * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2 - * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2 - * @note The starting pad value of x must be passed at compile time using -DPAD_LEFT_X. e.g. -DPAD_LEFT_X=2 - * @note The ending pad value of x must be passed at compile time using -DPAD_RIGHT_X. e.g. -DPAD_RIGHT_X=2 - * @note The starting pad value of y must be passed at compile time using -DPAD_LEFT_Y. e.g. -DPAD_LEFT_Y=2 - * @note The ending pad value of y must be passed at compile time using -DPAD_RIGHT_Y. e.g. -DPAD_RIGHT_Y=2 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: All - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image - * @param[in] batch_id The output tensor batch id - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void space_to_batch_static_nchw( - TENSOR4D_DECLARATION(input), - const int batch_id, - TENSOR3D_DECLARATION(output)) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); - - int block_x = BLOCK_SHAPE_X; - int block_y = BLOCK_SHAPE_Y; - - const int out_x = get_global_id(0); - const int out_y = get_global_id(1); - const int z = get_global_id(2); - - const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x); - const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x); - - if(pos_y >= PAD_LEFT_Y && pos_y < PAD_LEFT_Y + HEIGHT_IN && pos_x >= PAD_LEFT_X && pos_x < PAD_LEFT_X + WIDTH_IN) - { - const int w = batch_id % BATCH_IN; - const int in_x = pos_x - PAD_LEFT_X; - const int in_y = pos_y - PAD_LEFT_Y; - - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w)); - } -} -/** Calculate the space to batch conversion.
(NHWC) - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2 - * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2 - * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2 - * @note The starting pad value of x must be passed at compile time using -DPAD_LEFT_X. e.g. -DPAD_LEFT_X=2 - * @note The ending pad value of x must be passed at compile time using -DPAD_RIGHT_X. e.g. -DPAD_RIGHT_X=2 - * @note The starting pad value of y must be passed at compile time using -DPAD_LEFT_Y. e.g. -DPAD_LEFT_Y=2 - * @note The ending pad value of y must be passed at compile time using -DPAD_RIGHT_Y. e.g. -DPAD_RIGHT_Y=2 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: All - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image - * @param[in] batch_id The output tensor batch id - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void space_to_batch_static_nhwc( - TENSOR4D_DECLARATION(input), - const int batch_id, - TENSOR3D_DECLARATION(output)) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); - - int block_x = BLOCK_SHAPE_X; - int block_y = BLOCK_SHAPE_Y; - - const int out_x = get_global_id(1); - const int out_y = get_global_id(2); - const int z = get_global_id(0); - - const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x); - const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x); - - if(pos_y >= PAD_LEFT_Y && pos_y < PAD_LEFT_Y + HEIGHT_IN && pos_x >= PAD_LEFT_X && pos_x < PAD_LEFT_X + WIDTH_IN) - { - const int w = batch_id % BATCH_IN; - const int in_x = pos_x - PAD_LEFT_X; - const int in_y = pos_y - PAD_LEFT_Y; - - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w)); - } -} -#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) &&
defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN) diff --git a/src/core/CL/cl_kernels/space_to_depth.cl b/src/core/CL/cl_kernels/space_to_depth.cl deleted file mode 100644 index 1217a37345..0000000000 --- a/src/core/CL/cl_kernels/space_to_depth.cl +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (c) 2019-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE) -/** Space to depth transformation. (NCHW) - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note The input tensor channel size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2 - * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: All - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[in] batch_id The input tensor batch id - * @param[out] output_ptr Pointer to the destination tensor.
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void space_to_depth_nchw( - TENSOR4D_DECLARATION(input), - const int batch_id, - TENSOR3D_DECLARATION(output)) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); - - const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE)); - const int x = get_global_id(0); - const int y = get_global_id(1); - const int z = get_global_id(2) % r; - - const int in_x = x * BLOCK_SHAPE + (get_global_id(2) / r) % BLOCK_SHAPE; - const int in_y = y * BLOCK_SHAPE + (get_global_id(2) / r) / BLOCK_SHAPE; - - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, batch_id)); -} -/** Space to depth transformation. (NHWC) - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note The input tensor channel size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2 - * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: All - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[in] batch_id The input tensor batch id - * @param[out] output_ptr Pointer to the destination tensor.
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void space_to_depth_nhwc( - TENSOR4D_DECLARATION(input), - const int batch_id, - TENSOR3D_DECLARATION(output)) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); - - const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE)); - const int x = get_global_id(1); - const int y = get_global_id(2); - const int z = get_global_id(0) % r; - - const int in_x = x * BLOCK_SHAPE + (get_global_id(0) / r) % BLOCK_SHAPE; - const int in_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE; - - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, batch_id)); -} -#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE) \ No newline at end of file diff --git a/src/core/CL/cl_kernels/stack_layer.cl b/src/core/CL/cl_kernels/stack_layer.cl deleted file mode 100644 index 438e858df2..0000000000 --- a/src/core/CL/cl_kernels/stack_layer.cl +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE.
- */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(AXIS) && defined(SRC_DIM2) && defined(DST_DIM3) - -#if AXIS == 0 -#define X_DST (idx_input) -#define Y_DST (x_src) -#define Z_DST (y_src) -#define W_DST (z_src) -#define K_DST (w_src) -#elif AXIS == 1 // AXIS == 1 -#define X_DST (x_src) -#define Y_DST (idx_input) -#define Z_DST (y_src) -#define W_DST (z_src) -#define K_DST (w_src) -#elif AXIS == 2 // AXIS == 2 -#define X_DST (x_src) -#define Y_DST (y_src) -#define Z_DST (idx_input) -#define W_DST (z_src) -#define K_DST (w_src) -#elif AXIS == 3 // AXIS == 3 -#define X_DST (x_src) -#define Y_DST (y_src) -#define Z_DST (z_src) -#define W_DST (idx_input) -#define K_DST (w_src) -#elif AXIS == 4 // AXIS == 4 -#define X_DST (x_src) -#define Y_DST (y_src) -#define Z_DST (z_src) -#define W_DST (w_src) -#define K_DST (idx_input) -#else // AXIS not supported -#error "Not supported axis" -#endif // AXIS == 0 - -/** OpenCL kernel to stack a rank-R tensor into one with rank-(R+1) along the axis dimension - * - * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float - * @note The dimension to stack the tensors along has to be passed at compile time using -DAXIS. i.e. -DAXIS=1 - * @note Dimension 2 of the input tensor must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM2=112) - * @note Dimension 3 of the output tensor must be passed at compile time using -DDST_DIM3 (e.g. -DDST_DIM3=112) - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: All - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] idx_input Index of the input tensor in the list of tensors to stack - */ -__kernel void stack_layer( - TENSOR4D_DECLARATION(src), - TENSOR4D_DECLARATION(dst), - unsigned int idx_input) -{ - uint x_src = get_global_id(0); - uint y_src = get_global_id(1); - uint z_src = (get_global_id(2) % SRC_DIM2); - uint w_src = (get_global_id(2) / SRC_DIM2); - - __global DATA_TYPE *src = (__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + x_src * sizeof(DATA_TYPE) + y_src * src_stride_y + z_src * src_stride_z + w_src * src_stride_w); - - __global DATA_TYPE *dst = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + X_DST * sizeof(DATA_TYPE) + Y_DST * dst_stride_y + Z_DST * dst_stride_z + W_DST * dst_stride_w + K_DST * - dst_stride_w * (uint)DST_DIM3); - - *dst = *src; -} - -#undef X_DST -#undef Y_DST -#undef Z_DST -#undef W_DST -#undef K_DST -#endif // defined(DATA_TYPE) && defined(AXIS) && defined(SRC_DIM2) && defined(DST_DIM3) diff --git a/src/core/CL/cl_kernels/tile.cl b/src/core/CL/cl_kernels/tile.cl deleted file mode 100644 index 79da7fe6b9..0000000000 --- a/src/core/CL/cl_kernels/tile.cl +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" -#if defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_DEPTH) -/** Perform a tile operation on an input tensor. - * - * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g.
-DDATA_TYPE=float - * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note All data types are supported. - * - * @param[in] input_ptr Pointer to the source image. Supported data types: All - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void tile( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output)) -{ - Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DST_DEPTH); - Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, SRC_DEPTH); - - // For all coordinates but x, each tile copies from the input - const int y = get_global_id(1); - const int z = get_global_id(2) % DST_DEPTH; - const int batch = get_global_id(2) / DST_DEPTH; - -#if defined(VEC_SIZE) && defined(OFFSET) - // If we are loading/storing multiple elements at a time, we need to - // not exceed the input boundaries. The last threads need to backtrack - // by OFFSET elements.
- const int id = (int)(get_global_id(0));
- int x = id * VEC_SIZE;
-
- // Shift x based on the previous offsets
- const int tile_number = x / SRC_WIDTH;
- x -= (tile_number) * OFFSET;
- int x_input = x % SRC_WIDTH;
-
- // Shift x based on being the last tile
- const int last_tile = (int)(x_input + VEC_SIZE > SRC_WIDTH);
- x -= last_tile * OFFSET;
- x_input = x % SRC_WIDTH;
- output.ptr -= (tile_number + last_tile) * OFFSET * output_stride_x;
-
- // Update the input pointer
- input.ptr = tensor4D_offset(&input, x_input, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES);
-
- // Copy the data
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr);
-
- VSTORE(VEC_SIZE)
- (data, 0, (__global DATA_TYPE *)output.ptr);
-#else // !defined(VEC_SIZE) || !defined(OFFSET)
- const int x = get_global_id(0);
-
- // Update the input pointer
- input.ptr = tensor4D_offset(&input, x % SRC_WIDTH, y % SRC_HEIGHT, z % SRC_DEPTH, batch % SRC_BATCHES);
-
- *((__global DATA_TYPE *)(output.ptr)) = *((__global DATA_TYPE *)(input.ptr));
-#endif // defined(VEC_SIZE) && defined(OFFSET)
-}
-#endif // defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(SRC_DEPTH) && defined(DST_DEPTH)
diff --git a/src/core/CL/cl_kernels/transpose.cl b/src/core/CL/cl_kernels/transpose.cl
deleted file mode 100644
index 82db2908b5..0000000000
--- a/src/core/CL/cl_kernels/transpose.cl
+++ /dev/null
@@ -1,240 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */ -#define PARTIAL_STORE_M0 VEC_SIZE_LEFTOVER_X -#define PARTIAL_STORE_N0 VEC_SIZE_LEFTOVER_Y - -#include "helpers.h" -#include "repeat.h" - -#if defined(DATA_TYPE_IN_BYTES) && defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X) && defined(VEC_SIZE_Y) && defined(VEC_SIZE_LEFTOVER_Y) - -#if VEC_SIZE_X == 1 -#if VEC_SIZE_Y == 1 -#define TRANSPOSED_U(val) \ - { \ - u0 \ - } -#elif VEC_SIZE_Y == 2 -#define TRANSPOSED_U(val) \ - { \ - u0, u1 \ - } -#elif VEC_SIZE_Y == 3 -#define TRANSPOSED_U(val) \ - { \ - u0, u1, u2 \ - } -#elif VEC_SIZE_Y == 4 -#define TRANSPOSED_U(val) \ - { \ - u0, u1, u2, u3 \ - } -#elif VEC_SIZE_Y == 8 -#define TRANSPOSED_U(val) \ - { \ - u0, u1, u2, u3, u4, u5, u6, u7 \ - } -#elif VEC_SIZE_Y == 16 -#define TRANSPOSED_U(val) \ - { \ - u0, u1, u2, u3, u4, u5, u6, u7, \ - u8, u9, u10, u11, u12, u13, u14, u15 \ - } -#endif /* switch VEC_SIZE_Y */ -#else // VEC_SIZE_X == 1 -#if VEC_SIZE_Y == 1 -#define TRANSPOSED_U(val) \ - { \ - u0.val \ - } -#elif VEC_SIZE_Y == 2 -#define TRANSPOSED_U(val) \ - { \ - u0.val, u1.val \ - } -#elif VEC_SIZE_Y == 3 -#define TRANSPOSED_U(val) \ - { \ - u0.val, u1.val, u2.val \ - } -#elif VEC_SIZE_Y == 4 -#define TRANSPOSED_U(val) \ - { \ - u0.val, u1.val, u2.val, u3.val \ - } -#elif VEC_SIZE_Y == 8 -#define TRANSPOSED_U(val) \ - { \ - u0.val, u1.val, u2.val, u3.val, u4.val, u5.val, u6.val, u7.val \ - } -#elif VEC_SIZE_Y == 16 -#define TRANSPOSED_U(val) \ - { \ - u0.val, u1.val, u2.val, u3.val, u4.val, u5.val, u6.val, u7.val, \ - u8.val, u9.val, u10.val, u11.val, u12.val, u13.val, u14.val, u15.val \ - } -#endif /* switch VEC_SIZE_Y */ -#endif // VEC_SIZE_X == 1 - -#if DATA_TYPE_IN_BYTES == 4 -#define DATA_TYPE uint -#elif DATA_TYPE_IN_BYTES == 2 -#define DATA_TYPE ushort -#elif DATA_TYPE_IN_BYTES == 1 -#define DATA_TYPE uchar -#else /* switch DATA_TYPE_IN_BYTES */ -#error DATA_TYPE_IN_BYTES not supported for transpose -#endif /* switch DATA_TYPE_IN_BYTES */ - -/** This OpenCL kernel computes the matrix transposition of input matrix - * - * @note The number of bytes of the data type need to be passed at compile time using -DDATA_TYPE_IN_BYTES. DATA_TYPE_IN_BYTES can be: - * -# -DDATA_TYPE_IN_BYTES=1 for transposing U8 or S8 matrices - * -# -DDATA_TYPE_IN_BYTES=2 for transposing U16, S16 or FP16 matrices - * -# -DDATA_TYPE_IN_BYTES=4 for transposing U32, S32 or FP32 matrices - * -# -DVEC_SIZE_X is the number of elements processed in X dimension - * -# -DVEC_SIZE_LEFTOVER_X is the leftover size in the X dimension; x_dimension % VEC_SIZE_X - * -# -DVEC_SIZE_Y is the number of elements processed in Y dimension - * -# -DVEC_SIZE_LEFTOVER_Y is the leftover size in the Y dimension; y_dimension % VEC_SIZE_Y - * - * - * @param[in] src_ptr Pointer to the source matrix. 
Supported data types: All
- * @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix
- * @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
- */
-__kernel void transpose(IMAGE_DECLARATION(src),
- IMAGE_DECLARATION(dst))
-{
- uint x_offs = max((int)(get_global_id(0) * VEC_SIZE_X - (VEC_SIZE_X - VEC_SIZE_LEFTOVER_X) % VEC_SIZE_X), 0);
- uint y_offs = max((int)(get_global_id(1) * VEC_SIZE_Y - (VEC_SIZE_Y - VEC_SIZE_LEFTOVER_Y) % VEC_SIZE_Y), 0);
-
- // Compute addresses
- __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * DATA_TYPE_IN_BYTES + y_offs * src_stride_y;
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + y_offs * DATA_TYPE_IN_BYTES + x_offs * dst_stride_y;
-
- // Load the NxM block at (x, y)
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u0 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)src_addr);
-#if VEC_SIZE_Y > 1
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u1 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + src_stride_y));
-#endif /* VEC_SIZE_Y > 1 */
-#if VEC_SIZE_Y > 2
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u2 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
-#endif /* VEC_SIZE_Y > 2 */
-#if VEC_SIZE_Y > 3
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u3 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
-#endif /* VEC_SIZE_Y > 3 */
-#if VEC_SIZE_Y > 4
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u4 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u5 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u6 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u7 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y));
-#endif /* VEC_SIZE_Y > 4 */
-#if VEC_SIZE_Y > 8
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u8 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 8 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u9 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 9 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u10 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 10 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u11 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 11 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u12 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 12 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u13 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 13 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u14 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 14 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_X)
- u15 = VLOAD(VEC_SIZE_X)(0, (__global DATA_TYPE *)(src_addr + 15 * src_stride_y));
-#endif /* VEC_SIZE_Y > 8 */
-
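- // For instance, with hypothetical build options VEC_SIZE_X=4 and VEC_SIZE_Y=4, u0..u3 hold
- // a 4x4 block and TRANSPOSED_U(s0) expands to { u0.s0, u1.s0, u2.s0, u3.s0 }: column 0 of
- // the loaded block becomes row 0 (t0) of the transposed block stored at (y, x).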
- // Create transposed vectors
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- t0 = TRANSPOSED_U(s0);
-#if VEC_SIZE_X > 1
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- t1 = TRANSPOSED_U(s1);
-#endif /* VEC_SIZE_X > 1 */
-#if VEC_SIZE_X > 2
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- t2 = TRANSPOSED_U(s2);
-#endif /* VEC_SIZE_X > 2 */
-#if VEC_SIZE_X > 3
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- t3 = TRANSPOSED_U(s3);
-#endif /* VEC_SIZE_X > 3 */
-#if VEC_SIZE_X > 4
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- t4 = TRANSPOSED_U(s4);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- t5 = TRANSPOSED_U(s5);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- t6 = TRANSPOSED_U(s6);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- t7 = TRANSPOSED_U(s7);
-#endif /* VEC_SIZE_X > 4 */
-#if VEC_SIZE_X > 8
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- t8 = TRANSPOSED_U(s8);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- t9 = TRANSPOSED_U(s9);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- tA = TRANSPOSED_U(sA);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- tB = TRANSPOSED_U(sB);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- tC = TRANSPOSED_U(sC);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- tD = TRANSPOSED_U(sD);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- tE = TRANSPOSED_U(sE);
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE_Y)
- tF = TRANSPOSED_U(sF);
-#endif /* VEC_SIZE_X > 8 */
-
- // Store the block at (y, x)
- REPEAT_VAR_INIT_TO_CONST(VEC_SIZE_X, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;
- STORE_BLOCK_BOUNDARY_AWARE(VEC_SIZE_X, VEC_SIZE_Y, DATA_TYPE, t, (__global uchar *)dst_addr, dst_stride_y, zout, VEC_SIZE_LEFTOVER_X, VEC_SIZE_LEFTOVER_Y, VEC_SIZE_LEFTOVER_X != 0
- && get_global_id(0) == 0,
- VEC_SIZE_LEFTOVER_Y != 0 && get_global_id(1) == 0);
-}
-
-#endif // defined(DATA_TYPE_IN_BYTES) && defined(VEC_SIZE_X) && defined(VEC_SIZE_LEFTOVER_X) && defined(VEC_SIZE_Y) && defined(VEC_SIZE_LEFTOVER_Y)
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/unpooling_layer.cl b/src/core/CL/cl_kernels/unpooling_layer.cl
deleted file mode 100644
index 457e9bf8f1..0000000000
--- a/src/core/CL/cl_kernels/unpooling_layer.cl
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "helpers.h" - -/** Performs max unpooling function with pool size equal to 2. - * - * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32 - * @note The width of the output tensor must be passed using -DWIDTH_DST e.g. -DWIDTH_DST=24 - * @note The height of the output tensor must be passed using -DHEIGHT_DST e.g. -DHEIGHT_DST=54 - * @note The depth of the output tensor must be passed using -DDEPTH_DST e.g. -DDEPTH_DST=32 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the output tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the output tensor - * @param[in] indices_ptr Pointer to the indices tensor. 
Supported data types: U32 - * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes) - * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes) - * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes) - * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor - */ -__kernel void max_unpooling_layer_2( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output), - TENSOR3D_DECLARATION(indices)) -{ - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(output); - Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices); - - unsigned int index = *((__global unsigned int *)indices.ptr); - DATA_TYPE value = *((__global DATA_TYPE *)input.ptr); - - *((__global DATA_TYPE *)tensor3D_index2ptr(&output, WIDTH_DST, HEIGHT_DST, DEPTH_DST, index)) = value; -} \ No newline at end of file diff --git a/src/core/CL/cl_kernels/upsample_layer.cl b/src/core/CL/cl_kernels/upsample_layer.cl deleted file mode 100644 index d0cc0f24b7..0000000000 --- a/src/core/CL/cl_kernels/upsample_layer.cl +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/** This function applies upsample on an input image. (NCHW) - * - * @attention The following variables must be passed at compile time: - * -# -DDATA_TYPE = Tensor data type. Supported data types: All - * -# -DVEC_SIZE_IN = Input vector size - * -# -DVEC_SIZE_OUT = Output vector size - * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to set this, might need to step back a bit) - * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to set this, might need to step back a bit) - * - * @param[in] src_ptr Pointer to the source image. 
Supported data types: All
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void upsample_layer_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
-#if defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
- // Check if access on width gets out of bounds
- // If it does, shift the access vector to access elements within bounds
- const int xi_in = (int)(get_global_id(0) * VEC_SIZE_IN);
- const int xi_out = (int)(get_global_id(0) * VEC_SIZE_OUT);
- src.ptr -= max(xi_in - (int)LAST_ACCESSED_X_IN, 0) * src_stride_x;
- dst.ptr -= max(xi_out - (int)LAST_ACCESSED_X_OUT, 0) * dst_stride_x;
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- data = vload8(0, (__global DATA_TYPE *)src.ptr);
-
- VEC_DATA_TYPE(DATA_TYPE, 16)
- data_out = (VEC_DATA_TYPE(DATA_TYPE, 16))(data.s0, data.s0, data.s1, data.s1, data.s2, data.s2, data.s3, data.s3, data.s4, data.s4, data.s5, data.s5, data.s6, data.s6, data.s7, data.s7);
-
- vstore16(data_out, 0, (__global DATA_TYPE *)dst.ptr);
- vstore16(data_out, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0));
-#else // !defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
- *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0)) = *((__global DATA_TYPE *)src.ptr);
- *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0)) = *((__global DATA_TYPE *)src.ptr);
-#endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
-}
-
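- // Note: the vectorized path above hard-codes a vload8/vstore16 pair, which presumably
- // corresponds to building with -DVEC_SIZE_IN=8 and -DVEC_SIZE_OUT=16. Each input element
- // is duplicated along X ([a, b, ..., h] -> [a, a, b, b, ..., h, h]) and the 16-element row
- // is stored to two consecutive output rows, giving the 2x upsampling in width and height.
-
-/** This function applies upsample on an input image. (NHWC)
- *
- * @attention The following variables must be passed at compile time:
- * -# -DDATA_TYPE = Tensor data type.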
Supported data types: All - * -# -DVEC_SIZE_IN = Input vector size - * -# -DVEC_SIZE_OUT = Output vector size - * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to set this, might need to step back a bit) - * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to set this, might need to step back a bit) - * - * @param[in] src_ptr Pointer to the source image. Supported data types: All - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void upsample_layer_nhwc( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - -#if defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT) - // Check if access on width gets out of bounds - // If it does shift access vector to access elements within bounds - const int xi_in = (int)(get_global_id(0) * VEC_SIZE_IN); - const int xi_out = (int)(get_global_id(0) * VEC_SIZE_OUT); - src.ptr -= max(xi_in - (int)LAST_ACCESSED_X_IN, 0) * src_stride_x; - dst.ptr -= max(xi_out - (int)LAST_ACCESSED_X_OUT, 0) * dst_stride_x; - - VEC_DATA_TYPE(DATA_TYPE, 16) - data = vload16(0, (__global DATA_TYPE *)src.ptr); - - vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0)); - vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0)); - vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 1)); - vstore16(data, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 1)); -#else // !defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT) - *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0)) = *((__global DATA_TYPE *)src.ptr); - *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0)) = *((__global DATA_TYPE *)src.ptr); - *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 1)) = *((__global DATA_TYPE *)src.ptr); - *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 1)) = *((__global DATA_TYPE *)src.ptr); -#endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && 
defined(LAST_ACCESSED_X_OUT)
-}
diff --git a/src/core/CL/cl_kernels/winograd_filter_transform.cl b/src/core/CL/cl_kernels/winograd_filter_transform.cl
deleted file mode 100644
index 5c3bb8aa9b..0000000000
--- a/src/core/CL/cl_kernels/winograd_filter_transform.cl
+++ /dev/null
@@ -1,1952 +0,0 @@
-/*
- * Copyright (c) 2018-2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(SRC_DIM_Z)
-
-#define OUTPUT_ROW_2x2_7x7(out, tmp) \
- ({ \
- out.s0 = -tmp.s0 / 36.f; \
- out.s1 = (tmp.s0 - tmp.s1 + tmp.s2 - tmp.s3 + tmp.s4 - tmp.s5 + tmp.s6) / 48.f; \
- out.s2 = (tmp.s0 + tmp.s1 + tmp.s2 + tmp.s3 + tmp.s4 + tmp.s5 + tmp.s6) / 48.f; \
- out.s3 = (-tmp.s0 + 2.f * tmp.s1 - 4.f * tmp.s2 + 8.f * tmp.s3 - 16.f * tmp.s4 + 32.f * tmp.s5 - 64.f * tmp.s6) / 120.f; \
- out.s4 = (-tmp.s0 - 2.f * tmp.s1 - 4.f * tmp.s2 - 8.f * tmp.s3 - 16.f * tmp.s4 - 32.f * tmp.s5 - 64.f * tmp.s6) / 120.f; \
- out.s5 = (tmp.s0 - 3.f * tmp.s1 + 9.f * tmp.s2 - 27.f * tmp.s3 + 81.f * tmp.s4 - 243.f * tmp.s5 + 729.f * tmp.s6) / 720.f; \
- out.s6 = (tmp.s0 + 3.f * tmp.s1 + 9.f * tmp.s2 + 27.f * tmp.s3 + 81.f * tmp.s4 + 243.f * tmp.s5 + 729.f * tmp.s6) / 720.f; \
- out.s7 = tmp.s6; \
- })
-
-/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 2x2/2x1/1x2
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_2x2_3x3_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
-
- const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
-
- // Load the values from the input tensor
-#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
-#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
-#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
-#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
-
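- // The rows below evaluate out = G * w * G^T (only one side of the transform is applied
- // for the 3x1/1x3 variants), where w is the loaded filter and G is the 4x3
- // filter-transform matrix of Winograd F(2x2, 3x3):
- //         [   1    0    0 ]
- //     G = [ 1/2  1/2  1/2 ]
- //         [ 1/2 -1/2  1/2 ]
- //         [   0    0    1 ]
- // e.g. out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f is element (0, 1) of G * w * G^T.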
- // Row 0
- VEC_DATA_TYPE(DATA_TYPE, 4)
- out0 = 0.0f;
- out0.s0 = (w0.s0);
- out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f;
- out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f;
- out0.s3 = (w0.s2);
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- // Row 1
- VEC_DATA_TYPE(DATA_TYPE, 4)
- out1 = 0.0f;
- out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f;
- out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f;
- out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f;
- out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f;
-
- // Row 2
- VEC_DATA_TYPE(DATA_TYPE, 4)
- out2 = 0.0f;
- out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f;
- out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 -
w1.s0 - w1.s1 - w1.s2) * 0.25f; - out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f; - out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f; - - // Row 3 - VEC_DATA_TYPE(DATA_TYPE, 4) - out3 = 0.0f; - out3.s0 = (w2.s0); - out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f; - out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f; - out3.s3 = (w2.s2); -#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - - int z = get_global_id(2); - int x0 = z / SRC_DIM_Z; // idx filter - int y0 = z % SRC_DIM_Z; // idx channel - - // Get output address - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y; - - // Store the values across the channels - // 16 channels for 3x3 kernels - // 4 channels for 3x1 or 1x3 kernels - *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0; - *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1; - *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2; - *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3; - -#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out1.s0; - *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out1.s1; - *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out1.s2; - *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out1.s3; - *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out2.s0; - *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out2.s1; - *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out2.s2; - *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out2.s3; - *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out3.s0; - *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out3.s1; - *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out3.s2; - *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out3.s3; -#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) -} - -/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 4x4/4x1/1x4 - * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 - * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time - * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_filter_transform_4x4_3x3_nchw( - TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z); - - const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0); - - // Load the values from the input tensor -#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) - VEC_DATA_TYPE(DATA_TYPE, 3) - w0 = vload3(0, (__global DATA_TYPE *)(src_addr)); -#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - VEC_DATA_TYPE(DATA_TYPE, 3) - w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)), - *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)), - *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y))); -#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - VEC_DATA_TYPE(DATA_TYPE, 3) - w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y)); - VEC_DATA_TYPE(DATA_TYPE, 3) - w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y)); - VEC_DATA_TYPE(DATA_TYPE, 3) - w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y)); -#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) - - // Row 0 - VEC_DATA_TYPE(DATA_TYPE, 8) - out0 = 0.0f; - out0.s0 = (w0.s0) / 16.f; - out0.s1 = (-w0.s0 - w0.s1 - w0.s2) / 24.f; - out0.s2 = (-w0.s0 + w0.s1 - w0.s2) / 24.f; - out0.s3 = (w0.s0 + 2.f * w0.s1 + 4.f * w0.s2) / 96.f; - out0.s4 = (w0.s0 - 2.f * w0.s1 + 4.f * w0.s2) / 96.f; - out0.s5 = (w0.s2) / 4.f; - -#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - // Row 1 - VEC_DATA_TYPE(DATA_TYPE, 8) - out1 = 0.0f; - out1.s0 = (-w0.s0 - w1.s0 - w2.s0) / 24.f; - out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f; - out1.s2 = (w0.s0 + w1.s0 + w2.s0 - w0.s1 - w1.s1 - w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f; - out1.s3 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (-w0.s1 - w1.s1 - w2.s1) + 4.f * (-w0.s2 
- w1.s2 - w2.s2)) / 144.f; - out1.s4 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (w0.s1 + w1.s1 + w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f; - out1.s5 = (-w0.s2 - w1.s2 - w2.s2) / 6.f; - - // Row 2 - VEC_DATA_TYPE(DATA_TYPE, 8) - out2 = 0.0f; - out2.s0 = (-w0.s0 + w1.s0 - w2.s0) / 24.f; - out2.s1 = (w0.s0 - w1.s0 + w2.s0 + w0.s1 - w1.s1 + w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f; - out2.s2 = (w0.s0 - w1.s0 + w2.s0 - w0.s1 + w1.s1 - w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f; - out2.s3 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (-w0.s1 + w1.s1 - w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f; - out2.s4 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (w0.s1 - w1.s1 + w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f; - out2.s5 = (-w0.s2 + w1.s2 - w2.s2) / 6.f; - - // Row 3 - VEC_DATA_TYPE(DATA_TYPE, 8) - out3 = 0.0f; - out3.s0 = (w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) / 96.f; - out3.s1 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 - 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f; - out3.s2 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 + 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f; - out3.s3 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 + 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f; - out3.s4 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 - 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f; - out3.s5 = (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2) / 24.f; - - // Row 4 - VEC_DATA_TYPE(DATA_TYPE, 8) - out4 = 0.0f; - out4.s0 = (w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) / 96.f; - out4.s1 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 + 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f; - out4.s2 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 - 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f; - out4.s3 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 - 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f; - out4.s4 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 + 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f; - out4.s5 = (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2) / 24.f; - - // Row 5 - VEC_DATA_TYPE(DATA_TYPE, 8) - out5 = 0.0f; - out5.s0 = (w2.s0) / 4.f; - out5.s1 = (-w2.s0 - w2.s1 - w2.s2) / 6.f; - out5.s2 = (-w2.s0 + w2.s1 - w2.s2) / 6.f; - out5.s3 = (w2.s0 + 2.f * w2.s1 + 4.f * w2.s2) / 24.f; - out5.s4 = (w2.s0 - 2.f * w2.s1 + 4.f * w2.s2) / 24.f; - out5.s5 = (w2.s2); -#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - - int z = get_global_id(2); - int x0 = z / SRC_DIM_Z; // idx filter - int y0 = z % SRC_DIM_Z; // idx channel - - // Get output address - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y; - - // Store the values across the channels - // 36 channels for 3x3 kernels - // 6 channels for 3x1 or 1x3 kernels - *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0; - *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1; - *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2; - *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3; - *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4; - *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5; - -#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out1.s0; - *(__global DATA_TYPE 
*)(dst_addr + 7 * dst_stride_z) = out1.s1; - *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s2; - *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s3; - *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s4; - *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s5; - *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out2.s0; - *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out2.s1; - *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out2.s2; - *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out2.s3; - *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s4; - *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s5; - *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out3.s0; - *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out3.s1; - *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out3.s2; - *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out3.s3; - *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out3.s4; - *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out3.s5; - *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out4.s0; - *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out4.s1; - *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out4.s2; - *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out4.s3; - *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out4.s4; - *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out4.s5; - *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out5.s0; - *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out5.s1; - *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out5.s2; - *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out5.s3; - *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out5.s4; - *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out5.s5; -#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) -} - -/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NHWC and the output tile is 4x4/4x1/1x4 - * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 - * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time - * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_filter_transform_4x4_3x3_nhwc( - TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z); - - const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w; - - // Load the values from the input tensor -#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z)); - DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z)); - DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z)); -#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 0 * src_stride_y)); - DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 1 * src_stride_y)); - DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z + 2 * src_stride_y)); -#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) - DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y)); - DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y)); - DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y)); - DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y)); - DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y)); - DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y)); -#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) -#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - - // Row 0 - DATA_TYPE out00, out01, out02, out03, out04, out05; - out00 = (w00) / 16.f; - out01 = (-w00 - w01 - w02) / 24.f; - out02 = (-w00 + w01 - w02) / 24.f; - out03 = (w00 + 2.f * w01 
+ 4.f * w02) / 96.f; - out04 = (w00 - 2.f * w01 + 4.f * w02) / 96.f; - out05 = (w02) / 4.f; - -#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - // Row 1 - DATA_TYPE out10, out11, out12, out13, out14, out15; - out10 = (-w00 - w10 - w20) / 24.f; - out11 = (w00 + w10 + w20 + w01 + w11 + w21 + w02 + w12 + w22) / 36.f; - out12 = (w00 + w10 + w20 - w01 - w11 - w21 + w02 + w12 + w22) / 36.f; - out13 = (-w00 - w10 - w20 + 2.f * (-w01 - w11 - w21) + 4.f * (-w02 - w12 - w22)) / 144.f; - out14 = (-w00 - w10 - w20 + 2.f * (w01 + w11 + w21) + 4.f * (-w02 - w12 - w22)) / 144.f; - out15 = (-w02 - w12 - w22) / 6.f; - - // Row 2 - DATA_TYPE out20, out21, out22, out23, out24, out25; - out20 = (-w00 + w10 - w20) / 24.f; - out21 = (w00 - w10 + w20 + w01 - w11 + w21 + w02 - w12 + w22) / 36.f; - out22 = (w00 - w10 + w20 - w01 + w11 - w21 + w02 - w12 + w22) / 36.f; - out23 = (-w00 + w10 - w20 + 2.f * (-w01 + w11 - w21) + 4.f * (-w02 + w12 - w22)) / 144.f; - out24 = (-w00 + w10 - w20 + 2.f * (w01 - w11 + w21) + 4.f * (-w02 + w12 - w22)) / 144.f; - out25 = (-w02 + w12 - w22) / 6.f; - - // Row 3 - DATA_TYPE out30, out31, out32, out33, out34, out35; - out30 = (w00 + 2.f * w10 + 4.f * w20) / 96.f; - out31 = (-w00 - 2.f * w10 - 4.f * w20 - w01 - 2.f * w11 - 4.f * w21 - w02 - 2.f * w12 - 4.f * w22) / 144.f; - out32 = (-w00 - 2.f * w10 - 4.f * w20 + w01 + 2.f * w11 + 4.f * w21 - w02 - 2.f * w12 - 4.f * w22) / 144.f; - out33 = ((w00 + 2.f * w10 + 4.f * w20) + 2.f * (w01 + 2.f * w11 + 4.f * w21) + 4.f * (w02 + 2.f * w12 + 4.f * w22)) / 576.f; - out34 = ((w00 + 2.f * w10 + 4.f * w20) + 2.f * (-w01 - 2.f * w11 - 4.f * w21) + 4.f * (w02 + 2.f * w12 + 4.f * w22)) / 576.f; - out35 = (w02 + 2.f * w12 + 4.f * w22) / 24.f; - - // Row 4 - DATA_TYPE out40, out41, out42, out43, out44, out45; - out40 = (w00 - 2.f * w10 + 4.f * w20) / 96.f; - out41 = (-w00 + 2.f * w10 - 4.f * w20 - w01 + 2.f * w11 - 4.f * w21 - w02 + 2.f * w12 - 4.f * w22) / 144.f; - out42 = (-w00 + 2.f * w10 - 4.f * w20 + w01 - 2.f * w11 + 4.f * w21 - w02 + 2.f * w12 - 4.f * w22) / 144.f; - out43 = ((w00 - 2.f * w10 + 4.f * w20) + 2.f * (w01 - 2.f * w11 + 4.f * w21) + 4.f * (w02 - 2.f * w12 + 4.f * w22)) / 576.f; - out44 = ((w00 - 2.f * w10 + 4.f * w20) + 2.f * (-w01 + 2.f * w11 - 4.f * w21) + 4.f * (w02 - 2.f * w12 + 4.f * w22)) / 576.f; - out45 = (w02 - 2.f * w12 + 4.f * w22) / 24.f; - - // Row 5 - DATA_TYPE out50, out51, out52, out53, out54, out55; - out50 = (w20) / 4.f; - out51 = (-w20 - w21 - w22) / 6.f; - out52 = (-w20 + w21 - w22) / 6.f; - out53 = (w20 + 2.f * w21 + 4.f * w22) / 24.f; - out54 = (w20 - 2.f * w21 + 4.f * w22) / 24.f; - out55 = (w22); -#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - - int x0 = get_global_id(2); // idx filter - int y0 = get_global_id(0); // idx channel - - // Get output address - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y; - - // Store the values across the channels - // 36 channels for 3x3 kernels - // 6 channels for 3x1 or 1x3 kernels - *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out00; - *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out01; - *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out02; - *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out03; - *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out04; - *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out05; -#if 
!defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out10; - *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out11; - *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out12; - *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out13; - *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out14; - *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out15; - *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out20; - *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out21; - *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out22; - *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out23; - *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out24; - *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out25; - *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out30; - *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out31; - *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out32; - *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out33; - *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out34; - *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out35; - *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out40; - *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out41; - *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out42; - *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out43; - *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out44; - *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out45; - *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out50; - *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out51; - *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out52; - *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out53; - *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out54; - *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out55; -#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) -} - -/** This OpenCL kernel performs Winograd filter transform 5x5/5x1 or 1x5 when the data layout is NCHW and the output tile is 4x4/4x1 or 1x4 - * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 - * - * @note If this kernel is used to perform Winograd filter transform 5x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time - * @note If this kernel is used to perform Winograd filter transform 1x5, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_filter_transform_4x4_5x5_nchw( - TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z); - - const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0); - - // Load the values from the input tensor -#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) - VEC_DATA_TYPE(DATA_TYPE, 4) - w00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y)); - DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4); -#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - VEC_DATA_TYPE(DATA_TYPE, 4) - w00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)), - *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)), - *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)), - *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y))); - DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)); -#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - VEC_DATA_TYPE(DATA_TYPE, 4) - w00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y)); - DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4); - VEC_DATA_TYPE(DATA_TYPE, 4) - w10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y)); - DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y) + 4); - VEC_DATA_TYPE(DATA_TYPE, 4) - w20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y)); - DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y) + 4); - VEC_DATA_TYPE(DATA_TYPE, 4) - w30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y)); - DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y) + 4); - VEC_DATA_TYPE(DATA_TYPE, 4) - w40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y)); - DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y) + 4); -#endif // 
defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) - - // Transform the input tile - - // Row 0 - VEC_DATA_TYPE(DATA_TYPE, 8) - out0 = 0.0f; - out0.s0 = w00.s0; - out0.s1 = -2.f * (w00.s0 + w00.s1 + w00.s2 + w00.s3 + w01) / 9.f; - out0.s2 = -2.f * (w00.s0 - w00.s1 + w00.s2 - w00.s3 + w01) / 9.f; - out0.s3 = (w00.s0 + 2.f * w00.s1 + 4.f * w00.s2 + 8.f * w00.s3 + 16.f * w01) / 90.f; - out0.s4 = (w00.s0 - 2.f * w00.s1 + 4.f * w00.s2 - 8.f * w00.s3 + 16.f * w01) / 90.f; - out0.s5 = (16.f * w00.s0 + 8.f * w00.s1 + 4.f * w00.s2 + 2.f * w00.s3 + w01) / 180.f; - out0.s6 = (16.f * w00.s0 - 8.f * w00.s1 + 4.f * w00.s2 - 2.f * w00.s3 + w01) / 180.f; - out0.s7 = w01; - -#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - // Row 1 - VEC_DATA_TYPE(DATA_TYPE, 8) - out1 = 0.0f; - out1.s0 = -2.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) / 9.f; - out1.s1 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + - (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f; - out1.s2 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - - (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f; - out1.s3 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 8.f * - (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f; - out1.s4 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 8.f * - (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f; - out1.s5 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 2.f * - (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f; - out1.s6 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 2.f * - (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f; - out1.s7 = -2.f * (w01 + w11 + w21 + w31 + w41) / 9.f; - - // Row 2 - VEC_DATA_TYPE(DATA_TYPE, 8) - out2 = 0.0f; - out2.s0 = -2.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) / 9.f; - out2.s1 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + - (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f; - out2.s2 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - - (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f; - out2.s3 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 8.f * - (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f; - out2.s4 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + 
w20.s2 - w30.s2 + w40.s2) - 8.f * - (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f; - out2.s5 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 2.f * - (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f; - out2.s6 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 2.f * - (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f; - out2.s7 = -2.f * (w01 - w11 + w21 - w31 + w41) / 9.f; - - // Row 3 - VEC_DATA_TYPE(DATA_TYPE, 8) - out3 = 0.0f; - out3.s0 = (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) / 90.f; - out3.s1 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + - (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + - (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f; - out3.s2 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + - (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + - (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f; - out3.s3 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f * - (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f * - (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f; - out3.s4 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f * - (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f * - (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f; - out3.s5 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f * - (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + - (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f; - out3.s6 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f * - (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + - (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f; - out3.s7 = (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) / 90.f; - - // Row 4 - VEC_DATA_TYPE(DATA_TYPE, 8) - out4 = 0.0f; - out4.s0 = (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) / 90.f; - out4.s1 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * 
w30.s1 + 16.f * w40.s1) + - (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + - (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f; - out4.s2 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + - (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + - (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f; - out4.s3 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f * - (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f * - (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f; - out4.s4 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f * - (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f * - (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f; - out4.s5 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f * - (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + - (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f; - out4.s6 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f * - (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + - (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f; - out4.s7 = (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) / 90.f; - - // Row 5 - VEC_DATA_TYPE(DATA_TYPE, 8) - out5 = 0.0f; - out5.s0 = (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) / 180.f; - out5.s1 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + - (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + - (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f; - out5.s2 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + - (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + - (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f; - out5.s3 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f * - (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f * - (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 
16200.f; - out5.s4 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f * - (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f * - (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f; - out5.s5 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f * - (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + - (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f; - out5.s6 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f * - (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + - (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f; - out5.s7 = (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) / 180.f; - - // Row 6 - VEC_DATA_TYPE(DATA_TYPE, 8) - out6 = 0.0f; - out6.s0 = (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) / 180.f; - out6.s1 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + - (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + - (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f; - out6.s2 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + - (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + - (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f; - out6.s3 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f * - (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f * - (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f; - out6.s4 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f * - (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f * - (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f; - out6.s5 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f * - (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + - (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f; - out6.s6 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f * - (16.f * w00.s2 - 
8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + - (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f; - out6.s7 = (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) / 180.f; - - // Row 7 - VEC_DATA_TYPE(DATA_TYPE, 8) - out7 = 0.0f; - out7.s0 = w40.s0; - out7.s1 = -2.f * (w40.s0 + w40.s1 + w40.s2 + w40.s3 + w41) / 9.f; - out7.s2 = -2.f * (w40.s0 - w40.s1 + w40.s2 - w40.s3 + w41) / 9.f; - out7.s3 = (w40.s0 + 2.f * w40.s1 + 4.f * w40.s2 + 8.f * w40.s3 + 16.f * w41) / 90.f; - out7.s4 = (w40.s0 - 2.f * w40.s1 + 4.f * w40.s2 - 8.f * w40.s3 + 16.f * w41) / 90.f; - out7.s5 = (16.f * w40.s0 + 8.f * w40.s1 + 4.f * w40.s2 + 2.f * w40.s3 + w41) / 180.f; - out7.s6 = (16.f * w40.s0 - 8.f * w40.s1 + 4.f * w40.s2 - 2.f * w40.s3 + w41) / 180.f; - out7.s7 = w41; -#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - - int z = get_global_id(2); - int x0 = z / SRC_DIM_Z; // idx filter - int y0 = z % SRC_DIM_Z; // idx channel - - // Get output address - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y; - - // Store the values across the channels - *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0; - *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1; - *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2; - *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3; - *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4; - *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5; - *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6; - *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7; - -#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0; - *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1; - *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2; - *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3; - *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4; - *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5; - *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6; - *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7; - *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0; - *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1; - *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2; - *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3; - *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4; - *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5; - *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6; - *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7; - *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0; - *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1; - *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2; - *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3; - *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4; - *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5; - *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6; - *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7; - *(__global DATA_TYPE *)(dst_addr + 32 * 
dst_stride_z) = out4.s0; - *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1; - *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2; - *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3; - *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4; - *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5; - *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6; - *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7; - *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0; - *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1; - *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2; - *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3; - *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4; - *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5; - *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6; - *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7; - *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0; - *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1; - *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2; - *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3; - *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4; - *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5; - *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6; - *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7; - *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0; - *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1; - *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2; - *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3; - *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4; - *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5; - *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6; - *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7; -#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) -} - -/** This OpenCL kernel performs Winograd filter transform 5x5/5x1 or 1x5 when the data layout is NHWC and the output tile is 4x4/4x1 or 1x4 - * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 - * @note If this kernel is used to perform Winograd filter transform 5x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time - * @note If this kernel is used to perform Winograd filter transform 1x5, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. 
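Reference for the two 4x4/5x5 kernels in this hunk: the out0..out7 row formulas are an unrolled Winograd filter transform, a sketch read directly off the assignments above rather than an independent derivation. With g the 5x5 filter tile,

    \hat{g} = G \, g \, G^{T}, \qquad
    G =
    \begin{pmatrix}
        1      & 0      & 0     & 0      & 0      \\
        -2/9   & -2/9   & -2/9  & -2/9   & -2/9   \\
        -2/9   & 2/9    & -2/9  & 2/9    & -2/9   \\
        1/90   & 2/90   & 4/90  & 8/90   & 16/90  \\
        1/90   & -2/90  & 4/90  & -8/90  & 16/90  \\
        16/180 & 8/180  & 4/180 & 2/180  & 1/180  \\
        16/180 & -8/180 & 4/180 & -2/180 & 1/180  \\
        0      & 0      & 0     & 0      & 1
    \end{pmatrix}

The full 2-D case therefore yields 8x8 = 64 transformed values per filter/channel pair, matching the 64 dst_stride_z stores; the 5x1/1x5 builds compute only \hat{g} = G g, i.e. the 8 stores kept under the HORIZONTAL/VERTICAL guards.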
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_filter_transform_4x4_5x5_nhwc( - TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z); - - const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * sizeof(DATA_TYPE) + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w; - -#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - // Load the values from the input tensor - DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z)); - DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z)); - DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z)); - DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z)); - DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z)); -#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - // Load the values from the input tensor - DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)); - DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)); - DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)); - DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)); - DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)); -#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - -#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y)); - DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y)); - DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y)); - DATA_TYPE w13 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 3 * src_stride_y)); - DATA_TYPE w14 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 4 * src_stride_y)); -
DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y)); - DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y)); - DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y)); - DATA_TYPE w23 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 3 * src_stride_y)); - DATA_TYPE w24 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 4 * src_stride_y)); - DATA_TYPE w30 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 0 * src_stride_y)); - DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 1 * src_stride_y)); - DATA_TYPE w32 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 2 * src_stride_y)); - DATA_TYPE w33 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 3 * src_stride_y)); - DATA_TYPE w34 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 4 * src_stride_y)); - DATA_TYPE w40 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 0 * src_stride_y)); - DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 1 * src_stride_y)); - DATA_TYPE w42 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 2 * src_stride_y)); - DATA_TYPE w43 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 3 * src_stride_y)); - DATA_TYPE w44 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 4 * src_stride_y)); -#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - - // Row 0 - VEC_DATA_TYPE(DATA_TYPE, 8) - out0 = 0.0f; - out0.s0 = w00; - out0.s1 = -2.f * (w00 + w01 + w02 + w03 + w04) / 9.f; - out0.s2 = -2.f * (w00 - w01 + w02 - w03 + w04) / 9.f; - out0.s3 = (w00 + 2.f * w01 + 4.f * w02 + 8.f * w03 + 16.f * w04) / 90.f; - out0.s4 = (w00 - 2.f * w01 + 4.f * w02 - 8.f * w03 + 16.f * w04) / 90.f; - out0.s5 = (16.f * w00 + 8.f * w01 + 4.f * w02 + 2.f * w03 + w04) / 180.f; - out0.s6 = (16.f * w00 - 8.f * w01 + 4.f * w02 - 2.f * w03 + w04) / 180.f; - out0.s7 = w04; - -#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - // Row 1 - VEC_DATA_TYPE(DATA_TYPE, 8) - out1 = 0.0f; - out1.s0 = -2.f * (w00 + w10 + w20 + w30 + w40) / 9.f; - out1.s1 = 4.f * ((w00 + w10 + w20 + w30 + w40) + (w01 + w11 + w21 + w31 + w41) + (w02 + w12 + w22 + w32 + w42) + (w03 + w13 + w23 + w33 + w43) + (w04 + w14 + w24 + w34 + w44)) / 81.f; - out1.s2 = 4.f * ((w00 + w10 + w20 + w30 + w40) - (w01 + w11 + w21 + w31 + w41) + (w02 + w12 + w22 + w32 + w42) - (w03 + w13 + w23 + w33 + w43) + (w04 + w14 + w24 + w34 + w44)) / 81.f; - out1.s3 = -((w00 + w10 + w20 + w30 + w40) + 2.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) + 8.f * (w03 + w13 + w23 + w33 + w43) + 16.f * - (w04 + w14 + w24 + w34 + w44)) / 405.f; - out1.s4 = -((w00 + w10 + w20 + w30 + w40) - 2.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) - 8.f * (w03 + w13 + w23 + w33 + w43) + 16.f * - (w04 + w14 + w24 + w34 + w44)) / 405.f; - out1.s5 = -(16.f * (w00 + w10 + w20 + w30 + w40) + 8.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) + 2.f * (w03 + w13 + w23 + w33 + w43) + - (w04 + w14 + w24 + w34 + w44)) / 810.f; - out1.s6 = -(16.f * (w00 + w10 + w20 + w30 + w40) - 8.f * (w01 + w11 + w21 + w31 + w41) + 4.f * (w02 + w12 + w22 + w32 + w42) - 2.f * (w03 + w13 + w23 + w33 + w43) + - (w04 + w14 + w24 + w34 + w44)) / 810.f; - out1.s7 = -2.f * (w04 + w14 + w24 + w34 + w44) / 9.f; - - // Row 2 - VEC_DATA_TYPE(DATA_TYPE, 8) - out2 = 0.0f; - out2.s0 = 
-2.f * (w00 - w10 + w20 - w30 + w40) / 9.f; - out2.s1 = 4.f * ((w00 - w10 + w20 - w30 + w40) + (w01 - w11 + w21 - w31 + w41) + (w02 - w12 + w22 - w32 + w42) + (w03 - w13 + w23 - w33 + w43) + (w04 - w14 + w24 - w34 + w44)) / 81.f; - out2.s2 = 4.f * ((w00 - w10 + w20 - w30 + w40) - (w01 - w11 + w21 - w31 + w41) + (w02 - w12 + w22 - w32 + w42) - (w03 - w13 + w23 - w33 + w43) + (w04 - w14 + w24 - w34 + w44)) / 81.f; - out2.s3 = -((w00 - w10 + w20 - w30 + w40) + 2.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) + 8.f * (w03 - w13 + w23 - w33 + w43) + 16.f * - (w04 - w14 + w24 - w34 + w44)) / 405.f; - out2.s4 = -((w00 - w10 + w20 - w30 + w40) - 2.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) - 8.f * (w03 - w13 + w23 - w33 + w43) + 16.f * - (w04 - w14 + w24 - w34 + w44)) / 405.f; - out2.s5 = -(16.f * (w00 - w10 + w20 - w30 + w40) + 8.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) + 2.f * (w03 - w13 + w23 - w33 + w43) + - (w04 - w14 + w24 - w34 + w44)) / 810.f; - out2.s6 = -(16.f * (w00 - w10 + w20 - w30 + w40) - 8.f * (w01 - w11 + w21 - w31 + w41) + 4.f * (w02 - w12 + w22 - w32 + w42) - 2.f * (w03 - w13 + w23 - w33 + w43) + - (w04 - w14 + w24 - w34 + w44)) / 810.f; - out2.s7 = -2.f * (w04 - w14 + w24 - w34 + w44) / 9.f; - - // Row 3 - VEC_DATA_TYPE(DATA_TYPE, 8) - out3 = 0.0f; - out3.s0 = (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) / 90.f; - out3.s1 = -((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + - (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 405.f; - out3.s2 = -((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - - (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 405.f; - out3.s3 = ((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + 2.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f * (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + 8.f - * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + 16.f * (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 8100.f; - out3.s4 = ((w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - 2.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f * (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - 8.f - * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + 16.f * (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 8100.f; - out3.s5 = (16.f * (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) + 8.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f * - (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) + 2.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 16200.f; - out3.s6 = (16.f * (w00 + 2.f * w10 + 4.f * w20 + 8.f * w30 + 16.f * w40) - 8.f * (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) + 4.f * - (w02 + 2.f * w12 + 4.f * w22 + 8.f * w32 + 16.f * w42) - 2.f * (w03 + 2.f * w13 + 4.f * w23 + 8.f * w33 + 16.f * w43) + (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44)) / 16200.f; - out3.s7 = (w04 + 2.f * w14 + 4.f * w24 + 8.f * w34 + 16.f * w44) / 90.f; - - // Row 4 - VEC_DATA_TYPE(DATA_TYPE, 8) - out4 = 0.0f; - out4.s0 = (w00 - 2.f * 
w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) / 90.f; - out4.s1 = -((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + - (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 405.f; - out4.s2 = -((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - - (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 405.f; - out4.s3 = ((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + 2.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f * (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + 8.f - * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + 16.f * (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 8100.f; - out4.s4 = ((w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - 2.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f * (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - 8.f - * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + 16.f * (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 8100.f; - out4.s5 = (16.f * (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) + 8.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f * - (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) + 2.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 16200.f; - out4.s6 = (16.f * (w00 - 2.f * w10 + 4.f * w20 - 8.f * w30 + 16.f * w40) - 8.f * (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) + 4.f * - (w02 - 2.f * w12 + 4.f * w22 - 8.f * w32 + 16.f * w42) - 2.f * (w03 - 2.f * w13 + 4.f * w23 - 8.f * w33 + 16.f * w43) + (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44)) / 16200.f; - out4.s7 = (w04 - 2.f * w14 + 4.f * w24 - 8.f * w34 + 16.f * w44) / 90.f; - - // Row 5 - VEC_DATA_TYPE(DATA_TYPE, 8) - out5 = 0.0f; - out5.s0 = (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) / 180.f; - out5.s1 = -((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + - (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 810.f; - out5.s2 = -((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - - (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 810.f; - out5.s3 = ((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) + 2.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f * (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + 8.f - * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + 16.f * (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 16200.f; - out5.s4 = ((16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - 2.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f * (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - 8.f - * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + 16.f * (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 16200.f; - out5.s5 = (16.f * (16.f * w00 + 8.f * w10 + 4.f * 
w20 + 2.f * w30 + w40) + 8.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f * - (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) + 2.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 32400.f; - out5.s6 = (16.f * (16.f * w00 + 8.f * w10 + 4.f * w20 + 2.f * w30 + w40) - 8.f * (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) + 4.f * - (16.f * w02 + 8.f * w12 + 4.f * w22 + 2.f * w32 + w42) - 2.f * (16.f * w03 + 8.f * w13 + 4.f * w23 + 2.f * w33 + w43) + (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44)) / 32400.f; - out5.s7 = (16.f * w04 + 8.f * w14 + 4.f * w24 + 2.f * w34 + w44) / 180.f; - - // Row 6 - VEC_DATA_TYPE(DATA_TYPE, 8) - out6 = 0.0f; - out6.s0 = (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) / 180.f; - out6.s1 = -((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + - (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 810.f; - out6.s2 = -((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - - (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 810.f; - out6.s3 = ((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + 2.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f * (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + 8.f - * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + 16.f * (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 16200.f; - out6.s4 = ((16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - 2.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f * (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - 8.f - * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + 16.f * (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 16200.f; - out6.s5 = (16.f * (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) + 8.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f * - (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) + 2.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 32400.f; - out6.s6 = (16.f * (16.f * w00 - 8.f * w10 + 4.f * w20 - 2.f * w30 + w40) - 8.f * (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) + 4.f * - (16.f * w02 - 8.f * w12 + 4.f * w22 - 2.f * w32 + w42) - 2.f * (16.f * w03 - 8.f * w13 + 4.f * w23 - 2.f * w33 + w43) + (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44)) / 32400.f; - out6.s7 = (16.f * w04 - 8.f * w14 + 4.f * w24 - 2.f * w34 + w44) / 180.f; - - // Row 7 - VEC_DATA_TYPE(DATA_TYPE, 8) - out7 = 0.0f; - out7.s0 = w40; - out7.s1 = -2.f * (w40 + w41 + w42 + w43 + w44) / 9.f; - out7.s2 = -2.f * (w40 - w41 + w42 - w43 + w44) / 9.f; - out7.s3 = (w40 + 2.f * w41 + 4.f * w42 + 8.f * w43 + 16.f * w44) / 90.f; - out7.s4 = (w40 - 2.f * w41 + 4.f * w42 - 8.f * w43 + 16.f * w44) / 90.f; - out7.s5 = (16.f * w40 + 8.f * w41 + 4.f * w42 + 2.f * w43 + w44) / 180.f; - out7.s6 = (16.f * w40 - 8.f * w41 + 4.f * w42 - 2.f * w43 + w44) / 180.f; - out7.s7 = w44; -#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - - int x0 = get_global_id(2); // idx filter - int y0 = get_global_id(0); // idx 
channel - - // Get output address - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y; - - // Store the values across the channels - *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0; - *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1; - *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2; - *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3; - *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4; - *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5; - *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6; - *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7; - -#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0; - *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1; - *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2; - *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3; - *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4; - *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5; - *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6; - *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7; - *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0; - *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1; - *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2; - *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3; - *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4; - *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5; - *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6; - *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7; - *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0; - *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1; - *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2; - *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3; - *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4; - *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5; - *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6; - *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7; - *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0; - *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1; - *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2; - *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3; - *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4; - *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5; - *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6; - *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7; - *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0; - *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1; - *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2; - *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3; - *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4; - *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5; - *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6; - *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7; - *(__global DATA_TYPE *)(dst_addr + 48 * 
dst_stride_z) = out6.s0; - *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1; - *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2; - *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3; - *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4; - *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5; - *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6; - *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7; - *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0; - *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1; - *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2; - *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3; - *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4; - *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5; - *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6; - *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7; -#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) -} -/** This OpenCL kernel performs Winograd filter transform 7x7/7x1 or 1x7 when the data layout is NHWC and the output tile is 2x2/2x1 or 1x2 - * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 - * @note If this kernel is used to perform Winograd filter transform 7x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time - * @note If this kernel is used to perform Winograd filter transform 1x7, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. 
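The @note lines in these doc blocks describe ordinary OpenCL program build options rather than kernel arguments. A minimal host-side sketch of how they might be supplied, assuming ctx, dev and source already exist; build_winograd_program is an illustrative helper, not a library API:

    #include <stddef.h>
    #include <CL/cl.h>

    /* Illustrative helper (not part of the library): compiles the filter
     * transform source with the mandatory compile-time definitions. */
    static cl_program build_winograd_program(cl_context ctx, cl_device_id dev, const char *source)
    {
        cl_int     err  = CL_SUCCESS;
        cl_program prog = clCreateProgramWithSource(ctx, 1, &source, NULL, &err);
        if(err != CL_SUCCESS)
        {
            return NULL;
        }
        /* -DDATA_TYPE and -DSRC_DIM_Z are always required; append
         * -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL (e.g. 5x1/7x1) or
         * -DWINOGRAD_FILTER_TRANSFORM_VERTICAL (e.g. 1x5/1x7) to select the 1-D paths. */
        err = clBuildProgram(prog, 1, &dev, "-DDATA_TYPE=half -DSRC_DIM_Z=64", NULL, NULL);
        if(err != CL_SUCCESS)
        {
            clReleaseProgram(prog);
            return NULL;
        }
        return prog;
    }

Defining neither guard builds the full 2-D transform, as in the kernels above.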
Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_filter_transform_2x2_7x7_nhwc( - TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z); - - const __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + get_global_id(0) * sizeof(DATA_TYPE) + get_global_id(1) * src_step_y + get_global_id(2) * src_step_w; - -#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - // Load the values from the input tensor - DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z)); - DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z)); - DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z)); - DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z)); - DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z)); - DATA_TYPE w05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z)); - DATA_TYPE w06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z)); -#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - // Load the values from the input tensor - DATA_TYPE w00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)); - DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)); - DATA_TYPE w02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)); - DATA_TYPE w03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)); - DATA_TYPE w04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)); - DATA_TYPE w05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)); - DATA_TYPE w06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_y)); -#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - -#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - DATA_TYPE w10 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 0 * src_stride_y)); - DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 1 * src_stride_y)); - DATA_TYPE w12 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 2 * src_stride_y)); - DATA_TYPE w13 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 3 * src_stride_y)); - DATA_TYPE w14 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 4 * src_stride_y)); - DATA_TYPE w15 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 5 * src_stride_y)); - DATA_TYPE w16 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z + 6 * src_stride_y)); - - DATA_TYPE w20 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 0 * src_stride_y)); - DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 1 * src_stride_y)); - DATA_TYPE w22 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 2 * src_stride_y)); - DATA_TYPE w23 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 3 * src_stride_y)); - DATA_TYPE w24 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z
+ 4 * src_stride_y)); - DATA_TYPE w25 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 5 * src_stride_y)); - DATA_TYPE w26 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z + 6 * src_stride_y)); - - DATA_TYPE w30 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 0 * src_stride_y)); - DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 1 * src_stride_y)); - DATA_TYPE w32 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 2 * src_stride_y)); - DATA_TYPE w33 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 3 * src_stride_y)); - DATA_TYPE w34 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 4 * src_stride_y)); - DATA_TYPE w35 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 5 * src_stride_y)); - DATA_TYPE w36 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z + 6 * src_stride_y)); - - DATA_TYPE w40 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 0 * src_stride_y)); - DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 1 * src_stride_y)); - DATA_TYPE w42 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 2 * src_stride_y)); - DATA_TYPE w43 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 3 * src_stride_y)); - DATA_TYPE w44 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 4 * src_stride_y)); - DATA_TYPE w45 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 5 * src_stride_y)); - DATA_TYPE w46 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z + 6 * src_stride_y)); - - DATA_TYPE w50 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 0 * src_stride_y)); - DATA_TYPE w51 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 1 * src_stride_y)); - DATA_TYPE w52 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 2 * src_stride_y)); - DATA_TYPE w53 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 3 * src_stride_y)); - DATA_TYPE w54 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 4 * src_stride_y)); - DATA_TYPE w55 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 5 * src_stride_y)); - DATA_TYPE w56 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z + 6 * src_stride_y)); - - DATA_TYPE w60 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 0 * src_stride_y)); - DATA_TYPE w61 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 1 * src_stride_y)); - DATA_TYPE w62 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 2 * src_stride_y)); - DATA_TYPE w63 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 3 * src_stride_y)); - DATA_TYPE w64 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 4 * src_stride_y)); - DATA_TYPE w65 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 5 * src_stride_y)); - DATA_TYPE w66 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z + 6 * src_stride_y)); - -#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - - VEC_DATA_TYPE(DATA_TYPE, 8) - tmp = 0.0f; - - // Row 0 - VEC_DATA_TYPE(DATA_TYPE, 8) - out0 = 0.0f; - - out0.s0 = -w00 / 36.0f; - out0.s1 = (w00 - w01 + w02 - w03 + w04 - w05 + w06) / 48.f; - out0.s2 = (w00 + w01 + w02 + w03 + w04 + w05 + w06) / 48.f; - out0.s3 = (-w00 + 2.f * w01 - 4.f * w02 + 8.f * w03 - 16.f * w04 + 32.f * w05 - 64.f * w06) / 120.f; - out0.s4 = (-w00 - 2.f * w01 - 4.f * w02 - 8.f * w03 - 16.f * w04 - 32.f * w05 - 64.f * w06) / 120.f; - out0.s5 = (w00 - 3.f * w01 + 9.f * w02 - 27.f * w03 + 81.f * w04 - 243.f * w05 + 729.f * w06) / 720.f; - out0.s6 = (w00 + 3.f * w01 + 9.f * w02 + 27.f * w03 + 
81.f * w04 + 243.f * w05 + 729.f * w06) / 720.f; - out0.s7 = w06; - - out0 /= (VEC_DATA_TYPE(DATA_TYPE, 8)) - 36.f; - -#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - - // Row 1 - VEC_DATA_TYPE(DATA_TYPE, 8) - out1 = 0.0f; - - tmp.s0 = (w00 - w10 + w20 - w30 + w40 - w50 + w60) / 48.f; - tmp.s1 = (w01 - w11 + w21 - w31 + w41 - w51 + w61) / 48.f; - tmp.s2 = (w02 - w12 + w22 - w32 + w42 - w52 + w62) / 48.f; - tmp.s3 = (w03 - w13 + w23 - w33 + w43 - w53 + w63) / 48.f; - tmp.s4 = (w04 - w14 + w24 - w34 + w44 - w54 + w64) / 48.f; - tmp.s5 = (w05 - w15 + w25 - w35 + w45 - w55 + w65) / 48.f; - tmp.s6 = (w06 - w16 + w26 - w36 + w46 - w56 + w66) / 48.f; - - OUTPUT_ROW_2x2_7x7(out1, tmp); - - // Row 2 - VEC_DATA_TYPE(DATA_TYPE, 8) - out2 = 0.0f; - - tmp.s0 = (w00 + w10 + w20 + w30 + w40 + w50 + w60) / 48.f; - tmp.s1 = (w01 + w11 + w21 + w31 + w41 + w51 + w61) / 48.f; - tmp.s2 = (w02 + w12 + w22 + w32 + w42 + w52 + w62) / 48.f; - tmp.s3 = (w03 + w13 + w23 + w33 + w43 + w53 + w63) / 48.f; - tmp.s4 = (w04 + w14 + w24 + w34 + w44 + w54 + w64) / 48.f; - tmp.s5 = (w05 + w15 + w25 + w35 + w45 + w55 + w65) / 48.f; - tmp.s6 = (w06 + w16 + w26 + w36 + w46 + w56 + w66) / 48.f; - - OUTPUT_ROW_2x2_7x7(out2, tmp); - - // Row 3 - VEC_DATA_TYPE(DATA_TYPE, 8) - out3 = 0.0f; - - tmp.s0 = (-w00 + 2.f * w10 - 4.f * w20 + 8.f * w30 - 16.f * w40 + 32.f * w50 - 64.f * w60) / 120.f; - tmp.s1 = (-w01 + 2.f * w11 - 4.f * w21 + 8.f * w31 - 16.f * w41 + 32.f * w51 - 64.f * w61) / 120.f; - tmp.s2 = (-w02 + 2.f * w12 - 4.f * w22 + 8.f * w32 - 16.f * w42 + 32.f * w52 - 64.f * w62) / 120.f; - tmp.s3 = (-w03 + 2.f * w13 - 4.f * w23 + 8.f * w33 - 16.f * w43 + 32.f * w53 - 64.f * w63) / 120.f; - tmp.s4 = (-w04 + 2.f * w14 - 4.f * w24 + 8.f * w34 - 16.f * w44 + 32.f * w54 - 64.f * w64) / 120.f; - tmp.s5 = (-w05 + 2.f * w15 - 4.f * w25 + 8.f * w35 - 16.f * w45 + 32.f * w55 - 64.f * w65) / 120.f; - tmp.s6 = (-w06 + 2.f * w16 - 4.f * w26 + 8.f * w36 - 16.f * w46 + 32.f * w56 - 64.f * w66) / 120.f; - - OUTPUT_ROW_2x2_7x7(out3, tmp); - - // Row 4 - VEC_DATA_TYPE(DATA_TYPE, 8) - out4 = 0.0f; - - tmp.s0 = (-w00 - 2.f * w10 - 4.f * w20 - 8.f * w30 - 16.f * w40 - 32.f * w50 - 64.f * w60) / 120.f; - tmp.s1 = (-w01 - 2.f * w11 - 4.f * w21 - 8.f * w31 - 16.f * w41 - 32.f * w51 - 64.f * w61) / 120.f; - tmp.s2 = (-w02 - 2.f * w12 - 4.f * w22 - 8.f * w32 - 16.f * w42 - 32.f * w52 - 64.f * w62) / 120.f; - tmp.s3 = (-w03 - 2.f * w13 - 4.f * w23 - 8.f * w33 - 16.f * w43 - 32.f * w53 - 64.f * w63) / 120.f; - tmp.s4 = (-w04 - 2.f * w14 - 4.f * w24 - 8.f * w34 - 16.f * w44 - 32.f * w54 - 64.f * w64) / 120.f; - tmp.s5 = (-w05 - 2.f * w15 - 4.f * w25 - 8.f * w35 - 16.f * w45 - 32.f * w55 - 64.f * w65) / 120.f; - tmp.s6 = (-w06 - 2.f * w16 - 4.f * w26 - 8.f * w36 - 16.f * w46 - 32.f * w56 - 64.f * w66) / 120.f; - - OUTPUT_ROW_2x2_7x7(out4, tmp); - - // Row 5 - VEC_DATA_TYPE(DATA_TYPE, 8) - out5 = 0.0f; - - tmp.s0 = (w00 - 3.f * w10 + 9.f * w20 - 27.f * w30 + 81.f * w40 - 243.f * w50 + 729.f * w60) / 720.f; - tmp.s1 = (w01 - 3.f * w11 + 9.f * w21 - 27.f * w31 + 81.f * w41 - 243.f * w51 + 729.f * w61) / 720.f; - tmp.s2 = (w02 - 3.f * w12 + 9.f * w22 - 27.f * w32 + 81.f * w42 - 243.f * w52 + 729.f * w62) / 720.f; - tmp.s3 = (w03 - 3.f * w13 + 9.f * w23 - 27.f * w33 + 81.f * w43 - 243.f * w53 + 729.f * w63) / 720.f; - tmp.s4 = (w04 - 3.f * w14 + 9.f * w24 - 27.f * w34 + 81.f * w44 - 243.f * w54 + 729.f * w64) / 720.f; - tmp.s5 = (w05 - 3.f * w15 + 9.f * w25 - 27.f * w35 + 81.f * w45 - 243.f * w55 + 729.f * w65) 
/ 720.f; - tmp.s6 = (w06 - 3.f * w16 + 9.f * w26 - 27.f * w36 + 81.f * w46 - 243.f * w56 + 729.f * w66) / 720.f; - - OUTPUT_ROW_2x2_7x7(out5, tmp); - - // Row 6 - VEC_DATA_TYPE(DATA_TYPE, 8) - out6 = 0.0f; - - tmp.s0 = (w00 + 3.f * w10 + 9.f * w20 + 27.f * w30 + 81.f * w40 + 243.f * w50 + 729.f * w60) / 720.f; - tmp.s1 = (w01 + 3.f * w11 + 9.f * w21 + 27.f * w31 + 81.f * w41 + 243.f * w51 + 729.f * w61) / 720.f; - tmp.s2 = (w02 + 3.f * w12 + 9.f * w22 + 27.f * w32 + 81.f * w42 + 243.f * w52 + 729.f * w62) / 720.f; - tmp.s3 = (w03 + 3.f * w13 + 9.f * w23 + 27.f * w33 + 81.f * w43 + 243.f * w53 + 729.f * w63) / 720.f; - tmp.s4 = (w04 + 3.f * w14 + 9.f * w24 + 27.f * w34 + 81.f * w44 + 243.f * w54 + 729.f * w64) / 720.f; - tmp.s5 = (w05 + 3.f * w15 + 9.f * w25 + 27.f * w35 + 81.f * w45 + 243.f * w55 + 729.f * w65) / 720.f; - tmp.s6 = (w06 + 3.f * w16 + 9.f * w26 + 27.f * w36 + 81.f * w46 + 243.f * w56 + 729.f * w66) / 720.f; - - OUTPUT_ROW_2x2_7x7(out6, tmp); - - // Row 7 - VEC_DATA_TYPE(DATA_TYPE, 8) - out7 = 0.0f; - - tmp.s0 = w60; - tmp.s1 = w61; - tmp.s2 = w62; - tmp.s3 = w63; - tmp.s4 = w64; - tmp.s5 = w65; - tmp.s6 = w66; - - OUTPUT_ROW_2x2_7x7(out7, tmp); - -#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - - int x0 = get_global_id(2); // idx filter - int y0 = get_global_id(0); // idx channel - - // Get output address - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y; - - // Store the values across the channels - *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0; - *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1; - *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2; - *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3; - *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4; - *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5; - *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6; - *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7; - -#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0; - *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1; - *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2; - *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3; - *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4; - *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5; - *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6; - *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7; - *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0; - *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1; - *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2; - *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3; - *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4; - *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5; - *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6; - *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7; - *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0; - *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1; - *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2; - *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3; - *(__global DATA_TYPE 
*)(dst_addr + 28 * dst_stride_z) = out3.s4; - *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5; - *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6; - *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7; - *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0; - *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1; - *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2; - *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3; - *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4; - *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5; - *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6; - *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7; - *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0; - *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1; - *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2; - *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3; - *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4; - *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5; - *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6; - *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7; - *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0; - *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1; - *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2; - *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3; - *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4; - *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5; - *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6; - *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7; - *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0; - *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1; - *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2; - *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3; - *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4; - *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5; - *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6; - *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7; -#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) -} -#endif // defined(SRC_DIM_Z) - -#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) -/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 2x1 - * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 - * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_filter_transform_2x1_3x1_nchw( - TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - winograd_filter_transform_2x2_3x3_nchw(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, - dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_offset_first_element_in_bytes); -} - -/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 4x1 - * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 - * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_filter_transform_4x1_3x1_nchw( - TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - winograd_filter_transform_4x4_3x3_nchw(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, - dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_offset_first_element_in_bytes); -} - -/** This OpenCL kernel performs Winograd filter transform 5x1 when the data layout is NCHW and the output tile is 4x1 - * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 - * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_filter_transform_4x1_5x1_nchw( - TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - winograd_filter_transform_4x4_5x5_nchw(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, - dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_offset_first_element_in_bytes); -} - -/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NHWC and the output tile is 4x1 - * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 - * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_filter_transform_4x1_3x1_nhwc( - TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - winograd_filter_transform_4x4_3x3_nhwc(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, - dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_offset_first_element_in_bytes); -} - -/** This OpenCL kernel performs Winograd filter transform 5x1 when the data layout is NHWC and the output tile is 4x1 - * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 - * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_filter_transform_4x1_5x1_nhwc( - TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - winograd_filter_transform_4x4_5x5_nhwc(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, - dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_offset_first_element_in_bytes); -} - -/** This OpenCL kernel performs Winograd filter transform 7x1 when the data layout is NHWC and the output tile is 2x1 - * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 - * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float. - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_filter_transform_2x1_7x1_nhwc( - TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - winograd_filter_transform_2x2_7x7_nhwc(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, - dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_offset_first_element_in_bytes); -} -#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) - -#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) -/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x2 - * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 - * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_filter_transform_1x2_1x3_nchw( - TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - winograd_filter_transform_2x2_3x3_nchw(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, - dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_offset_first_element_in_bytes); -} - -/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x4 - * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 - * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_filter_transform_1x4_1x3_nchw( - TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - winograd_filter_transform_4x4_3x3_nchw(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, - dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_offset_first_element_in_bytes); -} - -/** This OpenCL kernel performs Winograd filter transform 1x5 when the data layout is NCHW and the output tile is 1x4 - * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 - * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_filter_transform_1x4_1x5_nchw( - TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - winograd_filter_transform_4x4_5x5_nchw(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, - dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_offset_first_element_in_bytes); -} - -/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NHWC and the output tile is 1x4 - * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 - * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_filter_transform_1x4_1x3_nhwc( - TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - winograd_filter_transform_4x4_3x3_nhwc(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, - dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_offset_first_element_in_bytes); -} - -/** This OpenCL kernel performs Winograd filter transform 1x5 when the data layout is NHWC and the output tile is 1x4 - * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 - * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_filter_transform_1x4_1x5_nhwc( - TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - winograd_filter_transform_4x4_5x5_nhwc(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, - dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_offset_first_element_in_bytes); -} - -/** This OpenCL kernel performs Winograd filter transform 1x7 when the data layout is NHWC and the output tile is 1x2 - * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 - * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float. - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_filter_transform_1x2_1x7_nhwc( - TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - winograd_filter_transform_2x2_7x7_nhwc(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, - dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_offset_first_element_in_bytes); -} -#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) diff --git a/src/core/CL/cl_kernels/winograd_input_transform.cl b/src/core/CL/cl_kernels/winograd_input_transform.cl deleted file mode 100644 index fbb5e95196..0000000000 --- a/src/core/CL/cl_kernels/winograd_input_transform.cl +++ /dev/null @@ -1,2233 +0,0 @@ -/* - * Copyright (c) 2018-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" -#include "tile_helpers.h" - -#define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact) \ - ({ \ - comm_fact.s0 = tmp.s2 - 4.25f * tmp.s4 + tmp.s6; \ - comm_fact.s1 = tmp.s1 - 4.25f * tmp.s3 + tmp.s5; \ - comm_fact.s2 = 2.5f * tmp.s3; \ - comm_fact.s3 = 0.5f * tmp.s1 + 2.f * tmp.s5 - comm_fact.s2; \ - comm_fact.s4 = 0.25f * tmp.s2 - 1.25f * tmp.s4 + tmp.s6; \ - comm_fact.s5 = 4.f * tmp.s2 + tmp.s6 - 5.f * tmp.s4; \ - comm_fact.s6 = 2.f * tmp.s1 + 0.5f * tmp.s5 - comm_fact.s2; \ - \ - out.s0 = tmp.s0 - tmp.s6 + 5.25f * tmp.s4 - 5.25f * tmp.s2; \ - out.s1 = comm_fact.s0 + comm_fact.s1; \ - out.s2 = comm_fact.s0 - comm_fact.s1; \ - out.s3 = comm_fact.s3 + comm_fact.s4; \ - out.s4 = comm_fact.s4 - comm_fact.s3; \ - out.s5 = comm_fact.s5 + comm_fact.s6; \ - out.s6 = comm_fact.s5 - comm_fact.s6; \ - out.s7 = tmp.s7 - tmp.s1 + 5.25f * tmp.s3 - 5.25f * tmp.s5; \ - }) - -#define OUTPUT_ROW_2x2_7x7(out, tmp, comm_fact) \ - ({ \ - comm_fact.s0 = 36.0f * tmp.s2 - 13.0f * tmp.s4 + tmp.s6; \ - comm_fact.s1 = 36.0f * tmp.s1 - 13.0f * tmp.s3 + 1.0f * tmp.s5; \ - comm_fact.s2 = 9.0f * tmp.s2 - 10.0f * tmp.s4 + tmp.s6; \ - comm_fact.s3 = 18.0f * tmp.s1 - 20.0f * tmp.s3 + 2.0f * tmp.s5; \ - comm_fact.s4 = 4.0f * tmp.s2 - 5.0f * tmp.s4 + tmp.s6; \ - comm_fact.s5 = 12.0f * tmp.s1 - 15.0f * tmp.s3 + 3.0f * tmp.s5; \ - out.s0 = -36.0f * tmp.s0 + 49.0f * tmp.s2 + -14.0f * tmp.s4 + tmp.s6; \ - out.s1 = comm_fact.s0 - comm_fact.s1; \ - out.s2 = comm_fact.s0 + comm_fact.s1; \ - out.s3 = comm_fact.s2 - comm_fact.s3; \ - out.s4 = comm_fact.s2 + comm_fact.s3; \ - out.s5 = comm_fact.s4 - comm_fact.s5; \ - out.s6 = comm_fact.s4 + comm_fact.s5; \ - out.s7 = -36.0f * tmp.s1 + 0.0f * tmp.s2 + 49.0f * tmp.s3 - 14.0f * tmp.s5 + tmp.s7; \ - }) - -#if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H) -/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3 and the output tile is 2x2/2x1 or 1x2 - * - * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5). - * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). - * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 - * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 - * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time - * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source image. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - */ -__kernel void winograd_input_transform_2x2_3x3_stepz1_nchw( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - uint src_stride_w, - uint dst_stride_w) -{ - const int x = get_global_id(0); - const int y = get_global_id(1); -#if defined(SRC_DEPTH) - const int z = get_global_id(2) % SRC_DEPTH; - const int b = get_global_id(2) / SRC_DEPTH; -#else /* defined(SRC_DEPTH) */ - const int z = get_global_id(2); -#endif /* defined(SRC_DEPTH) */ - - // Compute input address -#if defined(SRC_DEPTH) - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w; -#else /* defined(SRC_DEPTH) */ - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z; -#endif /* defined(SRC_DEPTH) */ - - src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y); - -#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) - VEC_DATA_TYPE(DATA_TYPE, 4) - in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr)); -#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) - VEC_DATA_TYPE(DATA_TYPE, 4) - in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)), - *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)), - *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)), - *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y))); -#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - VEC_DATA_TYPE(DATA_TYPE, 4) - in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y)); - VEC_DATA_TYPE(DATA_TYPE, 4) - in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y)); - VEC_DATA_TYPE(DATA_TYPE, 4) - in_row2 = vload4(0, (__global 
DATA_TYPE *)(src_addr + 2 * src_stride_y)); - VEC_DATA_TYPE(DATA_TYPE, 4) - in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y)); -#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - - VEC_DATA_TYPE(DATA_TYPE, 4) - tmp0 = in_row0; - -#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - tmp0 -= in_row2; -#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - - DATA_TYPE out00 = tmp0.s0 - tmp0.s2; - DATA_TYPE out01 = tmp0.s1 + tmp0.s2; - DATA_TYPE out02 = tmp0.s2 - tmp0.s1; - DATA_TYPE out03 = tmp0.s1 - tmp0.s3; - -#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - VEC_DATA_TYPE(DATA_TYPE, 4) - tmp1 = in_row1 + in_row2; - VEC_DATA_TYPE(DATA_TYPE, 4) - tmp2 = in_row2 - in_row1; - VEC_DATA_TYPE(DATA_TYPE, 4) - tmp3 = in_row1 - in_row3; - - DATA_TYPE out10 = tmp1.s0 - tmp1.s2; - DATA_TYPE out11 = tmp1.s1 + tmp1.s2; - DATA_TYPE out12 = tmp1.s2 - tmp1.s1; - DATA_TYPE out13 = tmp1.s1 - tmp1.s3; - - DATA_TYPE out20 = tmp2.s0 - tmp2.s2; - DATA_TYPE out21 = tmp2.s1 + tmp2.s2; - DATA_TYPE out22 = tmp2.s2 - tmp2.s1; - DATA_TYPE out23 = tmp2.s1 - tmp2.s3; - - DATA_TYPE out30 = tmp3.s0 - tmp3.s2; - DATA_TYPE out31 = tmp3.s1 + tmp3.s2; - DATA_TYPE out32 = tmp3.s2 - tmp3.s1; - DATA_TYPE out33 = tmp3.s1 - tmp3.s3; -#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - -#if defined(SRC_DEPTH) - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w; -#else /* defined(SRC_DEPTH) */ - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y; -#endif /* defined(SRC_DEPTH) */ - - *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out00; // in_row0.s0; out00; - *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out01; // in_row0.s1; out01; - *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out02; // in_row0.s2; out02; - *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out03; // in_row0.s3; out03; - -#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out10; - *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out11; - *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out12; - *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out13; - *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out20; - *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out21; - *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out22; - *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out23; - *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out30; - *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out31; - *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out32; - *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out33; -#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) -} - -/** This OpenCL kernel computes the input transform when the kernel size is 3x3/3x1 or 1x3, the output tile is 2x2/2x1 or 1x2 and the number of channels is multiple of 2 - * - * @note The number of tiles in the x axis must be passed at compile time using 
-DNUM_TILES_X (i.e.-DNUM_TILES_X=5). - * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0). - * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 - * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 - * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time - * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - */ -__kernel void winograd_input_transform_2x2_3x3_stepz2_nchw( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - uint src_stride_w, - uint dst_stride_w) -{ - const int x = get_global_id(0); - const int y = get_global_id(1); -#if defined(SRC_DEPTH) - const int z = (get_global_id(2) * 2) % SRC_DEPTH; - const int b = (get_global_id(2) * 2) / SRC_DEPTH; -#else /* defined(SRC_DEPTH) */ - const int z = get_global_id(2) * 2; -#endif /* defined(SRC_DEPTH) */ - - // Compute input address -#if defined(SRC_DEPTH) - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w; -#else /* defined(SRC_DEPTH) */ - __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z; -#endif /* defined(SRC_DEPTH) */ - src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y); - -#if 
defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) - VEC_DATA_TYPE(DATA_TYPE, 4) - in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr)); -#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) - VEC_DATA_TYPE(DATA_TYPE, 4) - in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)), - *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)), - *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)), - *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y))); -#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - VEC_DATA_TYPE(DATA_TYPE, 4) - in_row0 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y)); - VEC_DATA_TYPE(DATA_TYPE, 4) - in_row1 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y)); - VEC_DATA_TYPE(DATA_TYPE, 4) - in_row2 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y)); - VEC_DATA_TYPE(DATA_TYPE, 4) - in_row3 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y)); -#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - - src_addr += src_stride_z; -#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) - VEC_DATA_TYPE(DATA_TYPE, 4) - in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr)); -#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) - VEC_DATA_TYPE(DATA_TYPE, 4) - in_row4 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)), - *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)), - *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)), - *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y))); -#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - VEC_DATA_TYPE(DATA_TYPE, 4) - in_row4 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y)); - VEC_DATA_TYPE(DATA_TYPE, 4) - in_row5 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y)); - VEC_DATA_TYPE(DATA_TYPE, 4) - in_row6 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y)); - VEC_DATA_TYPE(DATA_TYPE, 4) - in_row7 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y)); -#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - - VEC_DATA_TYPE(DATA_TYPE, 4) - tmp0 = in_row0; - VEC_DATA_TYPE(DATA_TYPE, 4) - tmp4 = in_row4; - -#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - tmp0 -= in_row2; - tmp4 -= in_row6; -#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - - VEC_DATA_TYPE(DATA_TYPE, 2) - out00 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s0 - tmp0.s2, tmp4.s0 - tmp4.s2); - VEC_DATA_TYPE(DATA_TYPE, 2) - out01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 + tmp0.s2, tmp4.s1 + tmp4.s2); - VEC_DATA_TYPE(DATA_TYPE, 2) - out02 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s2 - tmp0.s1, tmp4.s2 - tmp4.s1); - VEC_DATA_TYPE(DATA_TYPE, 2) - out03 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp0.s1 - tmp0.s3, tmp4.s1 - tmp4.s3); - -#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - VEC_DATA_TYPE(DATA_TYPE, 4) - tmp1 = in_row1 + in_row2; - VEC_DATA_TYPE(DATA_TYPE, 4) - tmp2 = in_row2 - in_row1; - VEC_DATA_TYPE(DATA_TYPE, 4) - tmp3 = in_row1 - in_row3; - - VEC_DATA_TYPE(DATA_TYPE, 4) - tmp5 = in_row5 + in_row6; - VEC_DATA_TYPE(DATA_TYPE, 4) - tmp6 = in_row6 - in_row5; - 
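/* A note on the arithmetic above and below (the matrix itself is not written
 * out in this file): tmp0..tmp3 and, for the second channel, tmp4..tmp7 are
 * the rows of B^T * d for the standard Winograd F(2x2,3x3) input transform,
 *
 *         [ 1  0 -1  0 ]
 *   B^T = [ 0  1  1  0 ]
 *         [ 0 -1  1  0 ]
 *         [ 0  1  0 -1 ]
 *
 * e.g. tmp1 = in_row1 + in_row2 is row 1 of B^T applied to d. The outNM
 * vectors below apply the same four taps along the other axis, producing
 * B^T * d * B for both channels in a single work-item. */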
VEC_DATA_TYPE(DATA_TYPE, 4) - tmp7 = in_row5 - in_row7; - - VEC_DATA_TYPE(DATA_TYPE, 2) - out10 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s0 - tmp1.s2, tmp5.s0 - tmp5.s2); - VEC_DATA_TYPE(DATA_TYPE, 2) - out11 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 + tmp1.s2, tmp5.s1 + tmp5.s2); - VEC_DATA_TYPE(DATA_TYPE, 2) - out12 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s2 - tmp1.s1, tmp5.s2 - tmp5.s1); - VEC_DATA_TYPE(DATA_TYPE, 2) - out13 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp1.s1 - tmp1.s3, tmp5.s1 - tmp5.s3); - - VEC_DATA_TYPE(DATA_TYPE, 2) - out20 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s0 - tmp2.s2, tmp6.s0 - tmp6.s2); - VEC_DATA_TYPE(DATA_TYPE, 2) - out21 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 + tmp2.s2, tmp6.s1 + tmp6.s2); - VEC_DATA_TYPE(DATA_TYPE, 2) - out22 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s2 - tmp2.s1, tmp6.s2 - tmp6.s1); - VEC_DATA_TYPE(DATA_TYPE, 2) - out23 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp2.s1 - tmp2.s3, tmp6.s1 - tmp6.s3); - - VEC_DATA_TYPE(DATA_TYPE, 2) - out30 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s0 - tmp3.s2, tmp7.s0 - tmp7.s2); - VEC_DATA_TYPE(DATA_TYPE, 2) - out31 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 + tmp3.s2, tmp7.s1 + tmp7.s2); - VEC_DATA_TYPE(DATA_TYPE, 2) - out32 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s2 - tmp3.s1, tmp7.s2 - tmp7.s1); - VEC_DATA_TYPE(DATA_TYPE, 2) - out33 = (VEC_DATA_TYPE(DATA_TYPE, 2))(tmp3.s1 - tmp3.s3, tmp7.s1 - tmp7.s3); -#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - -#if defined(SRC_DEPTH) - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w; -#else /* defined(SRC_DEPTH) */ - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y; -#endif /* defined(SRC_DEPTH) */ - - vstore2(out00, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)); - vstore2(out01, 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)); - vstore2(out02, 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)); - vstore2(out03, 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)); - -#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - vstore2(out10, 0, (__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)); - vstore2(out11, 0, (__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)); - vstore2(out12, 0, (__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)); - vstore2(out13, 0, (__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)); - vstore2(out20, 0, (__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)); - vstore2(out21, 0, (__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)); - vstore2(out22, 0, (__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)); - vstore2(out23, 0, (__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)); - vstore2(out30, 0, (__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)); - vstore2(out31, 0, (__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)); - vstore2(out32, 0, (__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)); - vstore2(out33, 0, (__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)); -#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) -} - -/** This OpenCL kernel computes the input transform when the output tile is 4x4/4x1 or 1x4, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW - * - * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5). 
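 * @note For reference, the 1-D matrix behind the coefficients used in this kernel (not written out in this file) is the standard F(4x4,3x3) input transform
 *
 *             [ 4   0  -5   0   1   0 ]
 *             [ 0  -4  -4   1   1   0 ]
 *       B^T = [ 0   4  -4  -1   1   0 ]
 *             [ 0  -2  -1   2   1   0 ]
 *             [ 0   2  -1  -2   1   0 ]
 *             [ 0   4   0  -5   0   1 ]
 *
 *       and the literal constants in the body (16 = 4 * 4, -20 = 4 * -5, 25 = -5 * -5, ...) are products of one B^T entry per axis of the 2-D transform B^T * d * B.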
-/** This OpenCL kernel computes the input transform when the output tile is 4x4/4x1 or 1x4, the filter size 3x3/3x1 or 1x3 and the data layout is NCHW
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_4x4_3x3_stepz1_nchw(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst),
-    uint src_stride_w,
-    uint dst_stride_w)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-#if defined(SRC_DEPTH)
-    const int z = get_global_id(2) % SRC_DEPTH;
-    const int b = get_global_id(2) / SRC_DEPTH;
-#else /* defined(SRC_DEPTH) */
-    const int z = get_global_id(2);
-#endif /* defined(SRC_DEPTH) */
-
-    // Compute input address
-#if defined(SRC_DEPTH)
-    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
-#else /* defined(SRC_DEPTH) */
-    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
-#endif /* defined(SRC_DEPTH) */
-
-    src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-    // Row0
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    d00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
-                                        *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
-                                        *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
-                                        *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
-    VEC_DATA_TYPE(DATA_TYPE, 2)
-    d01 = (VEC_DATA_TYPE(DATA_TYPE, 2))(*((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
-                                        *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)));
-#else // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-    // Row0
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    d00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
-    VEC_DATA_TYPE(DATA_TYPE, 2)
-    d01 = vload2(2, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
-    DATA_TYPE out0 = 0.0f;
-    DATA_TYPE out1 = 0.0f;
-    DATA_TYPE out2 = 0.0f;
-    DATA_TYPE out3 = 0.0f;
-    DATA_TYPE out4 = 0.0f;
-    DATA_TYPE out5 = 0.0f;
-
-    // Channels [0, 5]: [out00, out01, out02, out03, out04, out05]
-    out0 += 16.0f * d00.s0 - 20.0f * d00.s2 + 4.0f * d01.s0;
-    out1 += -16.0f * d00.s1 - 16.0f * d00.s2 + 4.0f * d00.s3 + 4.0f * d01.s0;
-    out2 += 16.0f * d00.s1 - 16.0f * d00.s2 - 4.0f * d00.s3 + 4.0f * d01.s0;
-    out3 += -8.0f * d00.s1 - 4.0f * d00.s2 + 8.0f * d00.s3 + 4.0f * d01.s0;
-    out4 += 8.0f * d00.s1 - 4.0f * d00.s2 - 8.0f * d00.s3 + 4.0f * d01.s0;
-    out5 += 16.0f * d00.s1 - 20.0f * d00.s3 + 4.0f * d01.s1;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-    // Row4
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    d40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
-    VEC_DATA_TYPE(DATA_TYPE, 2)
-    d41 = vload2(2, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
-
-    // k0, k1, k2, k3, k4, k5 are common terms for row0, row1, row2, row3 and row4
-    DATA_TYPE k0 = d41.s0;
-    DATA_TYPE k1 = d41.s0;
-    DATA_TYPE k2 = d41.s0;
-    DATA_TYPE k3 = d41.s0;
-    DATA_TYPE k4 = d41.s0;
-    DATA_TYPE k5 = 0.0f;
-
-    k0 += 4.0f * d40.s0 - 5.0f * d40.s2;
-    k1 += -4.0f * d40.s1 - 4.0f * d40.s2 + d40.s3;
-    k2 += 4.0f * d40.s1 - 4.0f * d40.s2 - d40.s3;
-    k3 += -2.0f * d40.s1 + 2.0f * d40.s3 - d40.s2;
-    k4 += 2.0f * d40.s1 - 2.0f * d40.s3 - d40.s2;
-    k5 += 4.0f * d40.s1 - 5.0f * d40.s3 + d41.s1;
-
-    out0 += k0;
-    out1 += k1;
-    out2 += k2;
-    out3 += k3;
-    out4 += k4;
-    out5 += k5;
-
-    // Row2
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    d20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
-    VEC_DATA_TYPE(DATA_TYPE, 2)
-    d21 = vload2(2, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
-
-    out0 += -20.0f * d20.s0 + 25.0f * d20.s2 - 5.0f * d21.s0;
-    out1 += +20.0f * d20.s1 + 20.0f * d20.s2 - 5.0f * d20.s3 - 5.0f * d21.s0;
-    out2 += -20.0f * d20.s1 + 20.0f * d20.s2 + 5.0f * d20.s3 - 5.0f * d21.s0;
-    out3 += +10.0f * d20.s1 + 5.0f * d20.s2 - 10.0f * d20.s3 - 5.0f * d21.s0;
-    out4 += -10.0f * d20.s1 + 5.0f * d20.s2 + 10.0f * d20.s3 - 5.0f * d21.s0;
-    out5 += -20.0f * d20.s1 + 25.0f * d20.s3 - 5.0f * d21.s1;
-#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
-    // Compute destination address
-#if defined(SRC_DEPTH)
-    __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w);
-#else /* defined(SRC_DEPTH) */
-    __global DATA_TYPE *dst_addr = (__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y);
-#endif /* defined(SRC_DEPTH) */
-
-    uint dst_plane_stride = dst_stride_z / sizeof(DATA_TYPE);
-
-    *(dst_addr) = out0;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out1;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out2;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out3;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out4;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out5;
-    dst_addr += dst_plane_stride;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-    DATA_TYPE out6 = k0;
-    DATA_TYPE out7 = k1;
-    DATA_TYPE out8 = k2;
-    DATA_TYPE out9 = k3;
-    DATA_TYPE out10 = k4;
-    DATA_TYPE out11 = k5;
-    DATA_TYPE out12 = k0;
-    DATA_TYPE out13 = k1;
-    DATA_TYPE out14 = k2;
-    DATA_TYPE out15 = k3;
-    DATA_TYPE out16 = k4;
-    DATA_TYPE out17 = k5;
-    DATA_TYPE out18 = k0;
-    DATA_TYPE out19 = k1;
-    DATA_TYPE out20 = k2;
-    DATA_TYPE out21 = k3;
-    DATA_TYPE out22 = k4;
-    DATA_TYPE out23 = k5;
-    DATA_TYPE out24 = k0;
-    DATA_TYPE out25 = k1;
-    DATA_TYPE out26 = k2;
-    DATA_TYPE out27 = k3;
-    DATA_TYPE out28 = k4;
-    DATA_TYPE out29 = k5;
-
-    // Row1
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    d10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
-    VEC_DATA_TYPE(DATA_TYPE, 2)
-    d11 = vload2(2, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
-
-    // Row3
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    d30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
-    VEC_DATA_TYPE(DATA_TYPE, 2)
-    d31 = vload2(2, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
-
-    // Compute common parts for the channels between [6, 29]
-    // Channels [6, 11]: [out10, out11, out12, out13, out14, out15]
-    // Channels [12, 17]: [out20, out21, out22, out23, out24, out25]
-    DATA_TYPE part0 = -16.0f * d20.s0 + 20.0f * d20.s2 - 4.0f * d21.s0;
-    DATA_TYPE part1 = 16.0f * d10.s0 - 20.0f * d10.s2 + 4.0f * d11.s0 - 4.0f * d30.s0 + 5.0f * d30.s2 - d31.s0;
-    DATA_TYPE part2 = 16.0f * d20.s2 - 4.0f * d21.s0;
-    DATA_TYPE part3 = 16.0f * d20.s1 - 4.0f * d20.s3;
-    DATA_TYPE part4 = 16.0f * d10.s2 - 4.0f * d11.s0 - 4.0f * d30.s2 + d31.s0;
-    DATA_TYPE part5 = 16.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + d30.s3;
-    DATA_TYPE part6 = 4.0f * d20.s2 - 4.0f * d21.s0;
-    DATA_TYPE part7 = 8.0f * d10.s1 - 8.0f * d10.s3 - 2.0f * d30.s1 + 2.0f * d30.s3;
-    DATA_TYPE part8 = 4.0f * d10.s2 - 4.0f * d11.s0 - d30.s2 + d31.s0;
-    DATA_TYPE part9 = 8.0f * d20.s1 - 8.0f * d20.s3;
-    DATA_TYPE part10 = -16.0f * d20.s1 + 20.0f * d20.s3 - 4.0f * d21.s1;
-    DATA_TYPE part11 = -16.0f * d10.s1 + 20.0f * d10.s3 - 4.0f * d11.s1 + 4.0f * d30.s1 - 5.0f * d30.s3 + d31.s1;
-
-    // Channels [18, 23]: [out30, out31, out32, out33, out34, out35]
-    // Channels [24, 29]: [out40, out41, out42, out43, out44, out45]
-    DATA_TYPE part12 = 8.0f * d10.s0 - 10.0f * d10.s2 + 2.0f * d11.s0 - 8.0f * d30.s0 + 10.0f * d30.s2 - 2.0f * d31.s0;
-    DATA_TYPE part13 = part0 * 0.25f; // -4.0f * d20.s0 + 5.0f * d20.s2 - d21.s0
-    DATA_TYPE part14 = part2 * 0.25f; // 4.0f * d20.s2 - d21.s0
-    DATA_TYPE part15 = 8.0f * d10.s1 - 2.0f * d10.s3 - 8.0f * d30.s1 + 2.0f * d30.s3;
-    DATA_TYPE part16 = 8.0f * d10.s2 - 2.0f * d11.s0 - 8.0f * d30.s2 + 2.0f * d31.s0;
-    DATA_TYPE part17 = part3 * 0.25f; // 4.0f * d20.s1 - d20.s3
-    DATA_TYPE part18 = part6 * 0.25f; // d20.s2 - d21.s0
-    DATA_TYPE part19 = 4.0f * d10.s1 - 4.0f * d10.s3 - 4.0f * d30.s1 + 4.0f * d30.s3;
-    DATA_TYPE part20 = 2.0f * d10.s2 - 2.0f * d11.s0 - 2.0f * d30.s2 + 2.0f * d31.s0;
-    DATA_TYPE part21 = part9 * 0.25f; // 2.0f * (d20.s1 - d20.s3)
-    DATA_TYPE part22 = part10 * 0.25f; // - 4.0f * d20.s1 + 5.0f * d20.s3 - d21.s1
-    DATA_TYPE part23 = part11 * 0.5f + 6.0f * d30.s1 - 7.5f * d30.s3 + 1.5f * d31.s1; // - 8.0f * d10.s1 + 10.0f * d10.s3 - 2.0f * d11.s1 + 8.0f * d30.s1 - 10.0f * d30.s3 + 2.0f * d31.s1;
-
-    out6 += part0 - part1;
-    out12 += part0 + part1;
-    out7 += part2 + part3 + part4 + part5;
-    out8 += part2 - part3 + part4 - part5;
-    out13 += part2 + part3 - part4 - part5;
-    out14 += part2 - part3 - part4 + part5;
-    out9 += part6 + part7 + part8 + part9;
-    out10 += part6 - part7 + part8 - part9;
-    out15 += part6 - part7 - part8 + part9;
-    out16 += part6 + part7 - part8 - part9;
-    out11 += part10 + part11;
-    out17 += part10 - part11;
-
-    out18 += part13 - part12;
-    out24 += part13 + part12;
-    out19 += part14 + part15 + part16 + part17;
-    out20 += part14 - part15 + part16 - part17;
-    out25 += part14 - part15 - part16 + part17;
-    out26 += part14 + part15 - part16 - part17;
-    out21 += part18 + part19 + part20 + part21;
-    out22 += part18 - part19 + part20 - part21;
-    out27 += part18 - part19 - part20 + part21;
-    out28 += part18 + part19 - part20 - part21;
-    out23 += part22 + part23;
-    out29 += part22 - part23;
-
-    *(dst_addr) = out6;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out7;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out8;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out9;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out10;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out11;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out12;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out13;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out14;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out15;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out16;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out17;
-    dst_addr += dst_plane_stride;
-
-    *(dst_addr) = out18;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out19;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out20;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out21;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out22;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out23;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out24;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out25;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out26;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out27;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out28;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out29;
-    dst_addr += dst_plane_stride;
-
-    // Row5
-    VEC_DATA_TYPE(DATA_TYPE, 4)
-    d50 = vload4(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
-    VEC_DATA_TYPE(DATA_TYPE, 2)
-    d51 = vload2(2, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
-
-    // Channels [30, 35]
-    out0 = 16.0f * d10.s0 - 20.0f * d10.s2 - 20.0f * d30.s0 + 25.0f * d30.s2 + 4.0f * d50.s0 - 5.0f * d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
-    out1 = -16.0f * d10.s1 - 16.0f * d10.s2 + 4.0f * d10.s3 + 20.0f * d30.s1 + 20.0f * d30.s2 - 5.0f * d30.s3 - 4.0f * d50.s1 - 4.0f * d50.s2 + d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
-    out2 = 16.0f * d10.s1 - 16.0f * d10.s2 - 4.0f * d10.s3 - 20.0f * d30.s1 + 20.0f * d30.s2 + 5.0f * d30.s3 + 4.0f * d50.s1 - 4.0f * d50.s2 - d50.s3 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
-    out3 = -8.0f * d10.s1 - 4.0f * d10.s2 + 8.0f * d10.s3 + 10.0f * d30.s1 - 10.0f * d30.s3 + 5.0f * d30.s2 - 2.0f * d50.s1 + 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
-    out4 = 8.0f * d10.s1 - 4.0f * d10.s2 - 8.0f * d10.s3 - 10.0f * d30.s1 + 5.0f * d30.s2 + 10.0f * d30.s3 + 2.0f * d50.s1 - 2.0f * d50.s3 - d50.s2 + d51.s0 + 4.0f * d11.s0 - 5.0f * d31.s0;
-    out5 = 16.0f * d10.s1 - 20.0f * d10.s3 + 4.0f * d11.s1 - 20.0f * d30.s1 + 25.0f * d30.s3 - 5.0f * d31.s1 + 4.0f * d50.s1 - 5.0f * d50.s3 + d51.s1;
-
-    *(dst_addr) = out0;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out1;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out2;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out3;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out4;
-    dst_addr += dst_plane_stride;
-    *(dst_addr) = out5;
-    dst_addr += dst_plane_stride;
-#endif // #if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-}
-
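Note: the 4x4/3x3 kernel above is the same construction with the six-point F(4, 3) transform (interpolation points {0, +-1, +-2}); its channel expressions agree, up to constant scaling, with

\[
B^T = \begin{bmatrix}
4 & 0 & -5 & 0 & 1 & 0\\
0 & -4 & -4 & 1 & 1 & 0\\
0 & 4 & -4 & -1 & 1 & 0\\
0 & -2 & -1 & 2 & 1 & 0\\
0 & 2 & -1 & -2 & 1 & 0\\
0 & 4 & 0 & -5 & 0 & 1
\end{bmatrix}.
\]

The 16/-20/4 coefficients of the Row0 terms are 4x the first matrix row, while the Row4 k-terms use it unscaled; the uneven scaling is assumed to be compensated by the companion filter/output transforms rather than stated anywhere in this file.
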
-/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NCHW
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (i.e.-DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (i.e.-DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_4x4_5x5_stepz1_nchw(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst),
-    uint src_stride_w,
-    uint dst_stride_w)
-{
-    const int x = get_global_id(0);
-    const int y = get_global_id(1);
-#if defined(SRC_DEPTH)
-    const int z = get_global_id(2) % SRC_DEPTH;
-    const int b = get_global_id(2) / SRC_DEPTH;
-#else /* defined(SRC_DEPTH) */
-    const int z = get_global_id(2);
-#endif /* defined(SRC_DEPTH) */
-
-    // Compute input address
-#if defined(SRC_DEPTH)
-    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z + b * src_stride_w;
-#else /* defined(SRC_DEPTH) */
-    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * OUTPUT_TILE_W * sizeof(DATA_TYPE) + y * OUTPUT_TILE_H * src_stride_y + z * src_stride_z;
-#endif /* defined(SRC_DEPTH) */
-    src_addr = src_addr - ((int)PAD_LEFT * sizeof(DATA_TYPE)) - ((int)PAD_TOP * src_stride_y);
-
-    // Load input tile
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr));
-#elif defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
-    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = (VEC_DATA_TYPE(DATA_TYPE, 8))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
-                                                                              *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
-                                                                              *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
-                                                                              *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)),
-                                                                              *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)),
-                                                                              *((__global DATA_TYPE *)(src_addr + 5 * src_stride_y)),
-                                                                              *((__global DATA_TYPE *)(src_addr + 6 * src_stride_y)),
-                                                                              *((__global DATA_TYPE *)(src_addr + 7 * src_stride_y)));
-#else // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row0 = vload8(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
-    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row1 = vload8(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
-    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row2 = vload8(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
-    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row3 = vload8(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
-    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row4 = vload8(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
-    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row5 = vload8(0, (__global DATA_TYPE *)(src_addr + 5 * src_stride_y));
-    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row6 = vload8(0, (__global DATA_TYPE *)(src_addr + 6 * src_stride_y));
-    const VEC_DATA_TYPE(DATA_TYPE, 8) in_row7 = vload8(0, (__global DATA_TYPE *)(src_addr + 7 * src_stride_y));
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
-    // Calculate common factors for intermediate tensor
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    tmp0 = in_row0;
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    comm_fact0 = 0.0f;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-    comm_fact0 += in_row2 + in_row6 - (DATA_TYPE)4.25f * in_row4;
-    tmp0 += -in_row6 + (DATA_TYPE)5.25f * in_row4 - (DATA_TYPE)5.25f * in_row2;
-
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    comm_fact1 = in_row1 + in_row5 - (DATA_TYPE)4.25f * in_row3;
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    comm_fact2 = (DATA_TYPE)0.25f * in_row2 - (DATA_TYPE)1.25f * in_row4 + in_row6;
-
-    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp1 = comm_fact0 + comm_fact1;
-    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp2 = comm_fact0 - comm_fact1;
-
-    comm_fact0 = (DATA_TYPE)2.5f * in_row3;
-    comm_fact1 = (DATA_TYPE)0.5f * in_row1 - comm_fact0 + (DATA_TYPE)2.0f * in_row5;
-
-    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp3 = comm_fact1 + comm_fact2;
-    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp4 = comm_fact2 - comm_fact1;
-
-    comm_fact1 = (DATA_TYPE)2.0f * in_row1 - comm_fact0 + (DATA_TYPE)0.5f * in_row5;
-    comm_fact2 = (DATA_TYPE)4.0f * in_row2 - (DATA_TYPE)5.0f * in_row4 + in_row6;
-
-    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp5 = comm_fact1 + comm_fact2;
-    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp6 = comm_fact2 - comm_fact1;
-    const VEC_DATA_TYPE(DATA_TYPE, 8) tmp7 = in_row7 - in_row1 + (DATA_TYPE)5.25f * in_row3 - (DATA_TYPE)5.25f * in_row5;
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
-    // Calculate output rows (reuse comm_fact0 vector)
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    out0;
-
-    OUTPUT_ROW_4x4_5x5(out0, tmp0, comm_fact0);
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-    VEC_DATA_TYPE(DATA_TYPE, 8)
-    out1, out2, out3, out4, out5, out6, out7;
-
-    OUTPUT_ROW_4x4_5x5(out1, tmp1, comm_fact0);
-    OUTPUT_ROW_4x4_5x5(out2, tmp2, comm_fact0);
-    OUTPUT_ROW_4x4_5x5(out3, tmp3, comm_fact0);
-    OUTPUT_ROW_4x4_5x5(out4, tmp4, comm_fact0);
-    OUTPUT_ROW_4x4_5x5(out5, tmp5, comm_fact0);
-    OUTPUT_ROW_4x4_5x5(out6, tmp6, comm_fact0);
-    OUTPUT_ROW_4x4_5x5(out7, tmp7, comm_fact0);
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
-    // Store values across the channels
-#if defined(SRC_DEPTH)
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y + b * dst_stride_w;
-#else /* defined(SRC_DEPTH) */
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + z * sizeof(DATA_TYPE) + (x + y * (int)NUM_TILES_X) * dst_stride_y;
-#endif /* defined(SRC_DEPTH) */
-
-    *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z)) = out0.s0;
-    *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z)) = out0.s1;
-    *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z)) = out0.s2;
-    *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z)) = out0.s3;
-    *((__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)) = out0.s4;
-    *((__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)) = out0.s5;
-    *((__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)) = out0.s6;
-    *((__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)) = out0.s7;
-
-#if !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-    *((__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)) = out1.s0;
-    *((__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)) = out1.s1;
-    *((__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z)) = out1.s2;
-    *((__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z)) = out1.s3;
-    *((__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z)) = out1.s4;
-    *((__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z)) = out1.s5;
-    *((__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z)) = out1.s6;
-    *((__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z)) = out1.s7;
-    *((__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z)) = out2.s0;
-    *((__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z)) = out2.s1;
-    *((__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z)) = out2.s2;
-    *((__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z)) = out2.s3;
-    *((__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z)) = out2.s4;
-    *((__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z)) = out2.s5;
-    *((__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z)) = out2.s6;
-    *((__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z)) = out2.s7;
-    *((__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z)) = out3.s0;
-    *((__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z)) = out3.s1;
-    *((__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z)) = out3.s2;
-    *((__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z)) = out3.s3;
-    *((__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z)) = out3.s4;
-    *((__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z)) = out3.s5;
-    *((__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z)) = out3.s6;
-    *((__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z)) = out3.s7;
-    *((__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z)) = out4.s0;
-    *((__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z)) = out4.s1;
-    *((__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z)) = out4.s2;
-    *((__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z)) = out4.s3;
-    *((__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z)) = out4.s4;
-    *((__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z)) = out4.s5;
-    *((__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z)) = out4.s6;
-    *((__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z)) = out4.s7;
-    *((__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z)) = out5.s0;
-    *((__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z)) = out5.s1;
-    *((__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z)) = out5.s2;
-    *((__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z)) = out5.s3;
-    *((__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z)) = out5.s4;
-    *((__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z)) = out5.s5;
-    *((__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z)) = out5.s6;
-    *((__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z)) = out5.s7;
-    *((__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z)) = out6.s0;
-    *((__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z)) = out6.s1;
-    *((__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z)) = out6.s2;
-    *((__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z)) = out6.s3;
-    *((__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z)) = out6.s4;
-    *((__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z)) = out6.s5;
-    *((__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z)) = out6.s6;
-    *((__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z)) = out6.s7;
-    *((__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z)) = out7.s0;
-    *((__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z)) = out7.s1;
-    *((__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z)) = out7.s2;
-    *((__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z)) = out7.s3;
-    *((__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z)) = out7.s4;
-    *((__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z)) = out7.s5;
-    *((__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z)) = out7.s6;
-    *((__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z)) = out7.s7;
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-}
-
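Note: for the 5x5 case the transform grows to eight interpolation points, {0, +-1, +-2, +-1/2}. The 5.25 constants in tmp0 and tmp7 fall out of the point polynomial

\[
(x^2 - 1)(x^2 - 4)\left(x^2 - \tfrac{1}{4}\right) = x^6 - 5.25\,x^4 + 5.25\,x^2 - 1,
\]

and the 4.25/2.5/2.0/1.25/0.5/0.25 constants in the comm_fact terms are consistent with the remaining rows of the corresponding B^T, shared between pairs of output rows through the tmp1/tmp2, tmp3/tmp4 and tmp5/tmp6 sums and differences.
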
-#if defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y)
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
-    TENSOR4D(src, BUFFER),
-    TENSOR4D(dst, BUFFER))
-{
-    const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
-    const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
-    const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
-
-    // All the tensor dimensions are passed at compile time.
-    // In case of dynamic tensor support, the following dimensions should be passed as function argument.
-#define _ISRC_WIDTH SRC_WIDTH
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _INUM_TILES_X NUM_TILES_X
-#define _INUM_TILES_Y NUM_TILES_Y
-
-    int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
-    int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
-    x -= PAD_LEFT;
-    y -= PAD_TOP;
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
-    TILE(DATA_TYPE, 6, 1, in);
-    TILE(DATA_TYPE, 6, 1, out);
-
-    // Initialize the input tile
-    LOOP_UNROLLING(int, i, 0, 1, 6,
-    {
-        in[i].v = 0;
-    })
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-    T_LOAD_NHWC(DATA_TYPE, 1, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-    T_LOAD_NHWC(DATA_TYPE, 6, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-
-    TILE(DATA_TYPE, 6, 1, com);
-
-    LOOP_UNROLLING(int, i, 0, 1, 6,
-    {
-        in[i].v *= 4.0f;
-    })
-
-    com[0].v = in[2].v - 4.f * in[0].v;
-    com[1].v = in[3].v - 4.f * in[1].v;
-    com[2].v = in[4].v - 4.f * in[2].v;
-    com[3].v = in[5].v - 4.f * in[3].v;
-    com[4].v = in[3].v - in[1].v;
-    com[4].v = com[4].v + com[4].v;
-    com[5].v = in[4].v - in[2].v;
-
-    out[0].v = com[2].v - com[0].v;
-    out[1].v = com[2].v + com[1].v;
-    out[2].v = com[2].v - com[1].v;
-    out[3].v = com[5].v + com[4].v;
-    out[4].v = com[5].v - com[4].v;
-    out[5].v = com[3].v - com[1].v;
-
-    TILE(uint, 6, 1, dst_indirect_y);
-
-    LOOP_UNROLLING(int, i, 0, 1, 6,
-    {
-        dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
-        dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 6;
-    })
-
-    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 6, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
-    TILE(DATA_TYPE, 36, 1, in);
-
-    // Initialize the input tile
-    LOOP_UNROLLING(int, i, 0, 1, 36,
-    {
-        in[i].v = 0;
-    })
-
-    // Load the tile from a NHWC tensor
-    T_LOAD_NHWC(DATA_TYPE, 6, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-
-    TILE(DATA_TYPE, 6, 1, com);
-    TILE(DATA_TYPE, 36, 1, tmp);
-
-    LOOP_UNROLLING(int, i, 0, 1, 6,
-    {
-        com[0].v = in[2 * 6 + i].v - (DATA_TYPE)4.0f * in[0 * 6 + i].v;
-        com[1].v = in[3 * 6 + i].v - (DATA_TYPE)4.0f * in[1 * 6 + i].v;
-        com[2].v = in[4 * 6 + i].v - (DATA_TYPE)4.0f * in[2 * 6 + i].v;
-        com[3].v = in[5 * 6 + i].v - (DATA_TYPE)4.0f * in[3 * 6 + i].v;
-        com[4].v = in[3 * 6 + i].v - in[1 * 6 + i].v;
-        com[4].v = com[4].v + com[4].v;
-        com[5].v = in[4 * 6 + i].v - in[2 * 6 + i].v;
-        tmp[i + 0 * 6].v = com[2].v - com[0].v;
-        tmp[i + 1 * 6].v = com[2].v + com[1].v;
-        tmp[i + 2 * 6].v = com[2].v - com[1].v;
-        tmp[i + 3 * 6].v = com[5].v + com[4].v;
-        tmp[i + 4 * 6].v = com[5].v - com[4].v;
-        tmp[i + 5 * 6].v = com[3].v - com[1].v;
-    })
-
-    TILE(DATA_TYPE, 36, 1, out);
-
-    LOOP_UNROLLING(int, i, 0, 1, 6,
-    {
-        com[0].v = tmp[i * 6 + 2].v - 4.f * tmp[i * 6 + 0].v;
-        com[1].v = tmp[i * 6 + 3].v - 4.f * tmp[i * 6 + 1].v;
-        com[2].v = tmp[i * 6 + 4].v - 4.f * tmp[i * 6 + 2].v;
-        com[3].v = tmp[i * 6 + 5].v - 4.f * tmp[i * 6 + 3].v;
-        com[4].v = tmp[i * 6 + 3].v - tmp[i * 6 + 1].v;
-        com[4].v = com[4].v + com[4].v;
-        com[5].v = tmp[i * 6 + 4].v - tmp[i * 6 + 2].v;
-        out[i * 6 + 0].v = com[2].v - com[0].v;
-        out[i * 6 + 1].v = com[2].v + com[1].v;
-        out[i * 6 + 2].v = com[2].v - com[1].v;
-        out[i * 6 + 3].v = com[5].v + com[4].v;
-        out[i * 6 + 4].v = com[5].v - com[4].v;
-        out[i * 6 + 5].v = com[3].v - com[1].v;
-    })
-
-    // Compute destination address
-    TILE(uint, 36, 1, dst_indirect_y);
-
-    LOOP_UNROLLING(int, i, 0, 1, 36,
-    {
-        dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
-        dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 36;
-    })
-
-    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 36, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-}
-
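Note: the NHWC kernel above makes the separable structure explicit. The first LOOP_UNROLLING pass applies the 1-D transform down the columns of the loaded tile and the second applies it along the rows of the intermediate result,

\[
U = B^T d, \qquad V = U B,
\]

so each 6x6 tile is handled by two sweeps of six 1-D transforms, with the com[] entries holding the subexpressions shared inside each sweep.
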
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
-    TENSOR4D(src, BUFFER),
-    TENSOR4D(dst, BUFFER))
-{
-    const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
-    const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
-    const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
-
-    // All the tensor dimensions are passed at compile time.
-    // In case of dynamic tensor support, the following dimensions should be passed as function argument.
-#define _ISRC_WIDTH SRC_WIDTH
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _INUM_TILES_X NUM_TILES_X
-#define _INUM_TILES_Y NUM_TILES_Y
-
-    int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
-    int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
-    x -= PAD_LEFT;
-    y -= PAD_TOP;
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
-    TILE(DATA_TYPE, 8, 1, in);
-    TILE(DATA_TYPE, 8, 1, out);
-
-    // Initialize the input tile
-    LOOP_UNROLLING(int, i, 0, 1, 8,
-    {
-        in[i].v = 0;
-    })
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-    T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-    T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-
-    TILE(DATA_TYPE, 1, 8, com);
-
-    com[0].s[0] = in[2].v - 4.25f * in[4].v + in[6].v;
-    com[0].s[1] = in[1].v - 4.25f * in[3].v + in[5].v;
-    com[0].s[2] = 0.5f * in[1].v - 2.5f * in[3].v + 2.0f * in[5].v;
-    com[0].s[3] = 0.25f * in[2].v - 1.25f * in[4].v + in[6].v;
-    com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v;
-    com[0].s[5] = 2.0f * in[1].v - 2.5f * in[3].v + 0.5f * in[5].v;
-    out[0].s[0] = in[0].v - 5.25f * in[2].v + 5.25f * in[4].v - in[6].v;
-    out[1].s[0] = com[0].s[0] + com[0].s[1];
-    out[2].s[0] = com[0].s[0] - com[0].s[1];
-    out[3].s[0] = com[0].s[3] + com[0].s[2];
-    out[4].s[0] = com[0].s[3] - com[0].s[2];
-    out[5].s[0] = com[0].s[4] + com[0].s[5];
-    out[6].s[0] = com[0].s[4] - com[0].s[5];
-    out[7].s[0] = -in[1].v + 5.25f * in[3].v - 5.25f * in[5].v + in[7].v;
-
-    TILE(uint, 8, 1, dst_indirect_y);
-
-    LOOP_UNROLLING(int, i, 0, 1, 8,
-    {
-        dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
-        dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 8;
-    })
-
-    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
-    TILE(DATA_TYPE, 64, 1, in);
-    TILE(DATA_TYPE, 64, 1, out);
-
-    // Initialize the input tile
-    LOOP_UNROLLING(int, i, 0, 1, 64,
-    {
-        in[i].v = 0;
-    })
-
-    // Load the tile from a NHWC tensor
-    T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-
-    TILE(DATA_TYPE, 8, 8, com);
-
-    LOOP_UNROLLING(int, i, 0, 1, 8,
-    {
-        com[0].s[i] = in[2 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x
-        com[1].s[i] = in[1 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0]; // x
-        com[2].s[i] = (DATA_TYPE)0.25f * in[2 * 8 + i].s[0] - (DATA_TYPE)1.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x
-        com[3].s[i] = (DATA_TYPE)0.5f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0]; // x
-        com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
-        com[5].s[i] = (DATA_TYPE)2.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)0.5f * in[5 * 8 + i].s[0];
-        com[6].s[i] = in[0 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[2 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[4 * 8 + i].s[0] - in[6 * 8 + i].s[0];
-        com[7].s[i] = -in[1 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[3 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[5 * 8 + i].s[0] + in[7 * 8 + i].s[0];
-    })
-
-    TILE(DATA_TYPE, 8, 8, tmp);
-    tmp[0].v = com[6].v;
-    tmp[1].v = com[0].v + com[1].v;
-    tmp[2].v = com[0].v - com[1].v;
-    tmp[3].v = com[2].v + com[3].v;
-    tmp[4].v = com[2].v - com[3].v;
-    tmp[5].v = com[4].v + com[5].v;
-    tmp[6].v = com[4].v - com[5].v;
-    tmp[7].v = com[7].v;
-
-    LOOP_UNROLLING(int, i, 0, 1, 8,
-    {
-        com[0].s[0] = tmp[i].s[2] - 4.25f * tmp[i].s[4] + tmp[i].s[6];
-        com[0].s[1] = tmp[i].s[1] - 4.25f * tmp[i].s[3] + tmp[i].s[5];
-        com[0].s[2] = 0.5f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 2.0f * tmp[i].s[5];
-        com[0].s[3] = 0.25f * tmp[i].s[2] - 1.25f * tmp[i].s[4] + tmp[i].s[6];
-        com[0].s[4] = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6];
-        com[0].s[5] = 2.0f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 0.5f * tmp[i].s[5];
-        out[i * 8 + 0].s[0] = tmp[i].s[0] - 5.25f * tmp[i].s[2] + 5.25f * tmp[i].s[4] - tmp[i].s[6];
-        out[i * 8 + 1].s[0] = com[0].s[0] + com[0].s[1];
-        out[i * 8 + 2].s[0] = com[0].s[0] - com[0].s[1];
-        out[i * 8 + 3].s[0] = com[0].s[3] + com[0].s[2];
-        out[i * 8 + 4].s[0] = com[0].s[3] - com[0].s[2];
-        out[i * 8 + 5].s[0] = com[0].s[4] + com[0].s[5];
-        out[i * 8 + 6].s[0] = com[0].s[4] - com[0].s[5];
-        out[i * 8 + 7].s[0] = -tmp[i].s[1] + 5.25f * tmp[i].s[3] - 5.25f * tmp[i].s[5] + tmp[i].s[7];
-    })
-
-    TILE(uint, 64, 1, dst_indirect_y);
-
-    LOOP_UNROLLING(int, i, 0, 1, 64,
-    {
-        dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
-        dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 64;
-    })
-
-    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-}
-
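Note: every shape parameter listed in the @note blocks above is baked in when the program is built. A minimal host-side sketch of that build step; the -D values, and the program/device handles, are illustrative assumptions rather than anything this patch prescribes:

    #include <CL/cl.h>

    /* program is assumed to have been created from this kernel source and
     * device selected earlier; every -D value below is only an example. */
    static cl_int build_winograd_input_transform(cl_program program, cl_device_id device)
    {
        const char *opts =
            "-DDATA_TYPE=half -DNHWC "
            "-DSRC_WIDTH=96 -DSRC_HEIGHT=64 "
            "-DNUM_TILES_X=24 -DNUM_TILES_Y=16 "
            "-DPAD_LEFT=2 -DPAD_TOP=2 "
            "-DOUTPUT_TILE_W=4 -DOUTPUT_TILE_H=4";
        return clBuildProgram(program, 1, &device, opts, NULL, NULL);
    }
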
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 7x7/7x1/1x7 and the output tile is 2x2/7x1/1x7 when the data layout is NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(
-    TENSOR4D(src, BUFFER),
-    TENSOR4D(dst, BUFFER))
-{
-    const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
-    const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
-    const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
-
-    // All the tensor dimensions are passed at compile time.
-    // In case of dynamic tensor support, the following dimensions should be passed as function argument.
-#define _ISRC_WIDTH SRC_WIDTH
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _INUM_TILES_X NUM_TILES_X
-#define _INUM_TILES_Y NUM_TILES_Y
-
-    int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
-    int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
-    x -= PAD_LEFT;
-    y -= PAD_TOP;
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
-    TILE(DATA_TYPE, 8, 1, in);
-    TILE(DATA_TYPE, 8, 1, out);
-
-    // Initialize the input tile
-    LOOP_UNROLLING(int, i, 0, 1, 8,
-    {
-        in[i].v = 0;
-    })
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-    T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-    T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-
-    LOOP_UNROLLING(int, i, 0, 1, 8,
-    {
-        in[i].v *= (DATA_TYPE) - 36.0f;
-    })
-
-    TILE(DATA_TYPE, 1, 8, com) = { { { 0 } } };
-
-    com[0].s[0] = 36.0f * in[2].v - 13.0f * in[4].v + in[6].v;
-    com[0].s[1] = 36.0f * in[1].v - 13.0f * in[3].v + 1.0f * in[5].v;
-    com[0].s[2] = 9.0f * in[2].v - 10.0f * in[4].v + in[6].v;
-    com[0].s[3] = 18.0f * in[1].v - 20.0f * in[3].v + 2.0f * in[5].v;
-    com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v;
-    com[0].s[5] = 12.0f * in[1].v - 15.0f * in[3].v + 3.0f * in[5].v;
-    out[0].s[0] = -36.0f * in[0].v + 49.0f * in[2].v + -14.0f * in[4].v + in[6].v;
-    out[1].s[0] = com[0].s[0] - com[0].s[1];
-    out[2].s[0] = com[0].s[0] + com[0].s[1];
-    out[3].s[0] = com[0].s[2] - com[0].s[3];
-    out[4].s[0] = com[0].s[2] + com[0].s[3];
-    out[5].s[0] = com[0].s[4] - com[0].s[5];
-    out[6].s[0] = com[0].s[4] + com[0].s[5];
-    out[7].s[0] = -36.0f * in[1].v + 0.0f * in[2].v + 49.0f * in[3].v - 14.0f * in[5].v + in[7].v;
-
-    TILE(uint, 8, 1, dst_indirect_y);
-
-    LOOP_UNROLLING(int, i, 0, 1, 8,
-    {
-        dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
-        dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 8;
-    })
-
-    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
-    TILE(DATA_TYPE, 64, 1, in);
-    TILE(DATA_TYPE, 64, 1, out);
-
-    // Initialize the input tile
-    LOOP_UNROLLING(int, i, 0, 1, 64,
-    {
-        in[i].v = 0;
-    })
-
-    // Load the tile from a NHWC tensor
-    T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-
-    TILE(DATA_TYPE, 8, 8, com);
-
-    LOOP_UNROLLING(int, i, 0, 1, 8,
-    {
-        com[0].s[i] = (DATA_TYPE)36.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
-        com[1].s[i] = (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0];
-        com[2].s[i] = (DATA_TYPE)9.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)10.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
-        com[3].s[i] = (DATA_TYPE)18.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)20.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0];
-        com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
-        com[5].s[i] = (DATA_TYPE)12.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)15.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)3.0f * in[5 * 8 + i].s[0];
-        com[6].s[i] = (DATA_TYPE)49.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[0 * 8 + i].s[0] + in[6 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[4 * 8 + i].s[0];
-        com[7].s[i] = (DATA_TYPE)49.0f * in[3 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] + in[7 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[5 * 8 + i].s[0];
-    })
-
-    TILE(DATA_TYPE, 8, 8, tmp);
-    tmp[0].v = com[6].v;
-    tmp[1].v = com[0].v - com[1].v;
-    tmp[2].v = com[0].v + com[1].v;
-    tmp[3].v = com[2].v - com[3].v;
-    tmp[4].v = com[2].v + com[3].v;
-    tmp[5].v = com[4].v - com[5].v;
-    tmp[6].v = com[4].v + com[5].v;
-    tmp[7].v = com[7].v;
-
-    LOOP_UNROLLING(int, i, 0, 1, 8,
-    {
-        com[0].s[0] = 36.0f * tmp[i].s[2] - 13.0f * tmp[i].s[4] + tmp[i].s[6];
-        com[0].s[1] = 36.0f * tmp[i].s[1] - 13.0f * tmp[i].s[3] + 1.0f * tmp[i].s[5];
-        com[0].s[2] = 9.0f * tmp[i].s[2] - 10.0f * tmp[i].s[4] + tmp[i].s[6];
-        com[0].s[3] = 18.0f * tmp[i].s[1] - 20.0f * tmp[i].s[3] + 2.0f * tmp[i].s[5];
-        com[0].s[4] = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6];
-        com[0].s[5] = 12.0f * tmp[i].s[1] - 15.0f * tmp[i].s[3] + 3.0f * tmp[i].s[5];
-        out[i * 8 + 0].s[0] = -36.0f * tmp[i].s[0] + 49.0f * tmp[i].s[2] + -14.0f * tmp[i].s[4] + tmp[i].s[6];
-        out[i * 8 + 1].s[0] = com[0].s[0] - com[0].s[1];
-        out[i * 8 + 2].s[0] = com[0].s[0] + com[0].s[1];
-        out[i * 8 + 3].s[0] = com[0].s[2] - com[0].s[3];
-        out[i * 8 + 4].s[0] = com[0].s[2] + com[0].s[3];
-        out[i * 8 + 5].s[0] = com[0].s[4] - com[0].s[5];
-        out[i * 8 + 6].s[0] = com[0].s[4] + com[0].s[5];
-        out[i * 8 + 7].s[0] = -36.0f * tmp[i].s[1] + 0.0f * tmp[i].s[2] + 49.0f * tmp[i].s[3] - 14.0f * tmp[i].s[5] + tmp[i].s[7];
-    })
-
-    TILE(uint, 64, 1, dst_indirect_y);
-
-    LOOP_UNROLLING(int, i, 0, 1, 64,
-    {
-        dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
-        dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 64;
-    })
-
-    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-}
-
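Note: the 2x2/7x7 path above uses the eight interpolation points {0, +-1, +-2, +-3}; the -36/49/-14 pattern in out[0] and out[7] is the point polynomial

\[
(x^2 - 1)(x^2 - 4)(x^2 - 9) = x^6 - 14\,x^4 + 49\,x^2 - 36,
\]

and the (36, -13, 1), (9, -10, 1) and (4, -5, 1) triplets in the com[] terms are its quotients (x^2-4)(x^2-9), (x^2-1)(x^2-9) and (x^2-1)(x^2-4).
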
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(
-    TENSOR4D(src, BUFFER),
-    TENSOR4D(dst, BUFFER))
-{
-    winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
-                                                 src_stride_x,
-                                                 src_step_x,
-                                                 src_stride_y,
-                                                 src_step_y,
-                                                 src_stride_z,
-                                                 src_step_z,
-                                                 src_stride_w,
-                                                 src_step_w,
-                                                 src_offset_first_element_in_bytes,
-                                                 dst_ptr,
-                                                 dst_stride_x,
-                                                 dst_step_x,
-                                                 dst_stride_y,
-                                                 dst_step_y,
-                                                 dst_stride_z,
-                                                 dst_step_z,
-                                                 dst_stride_w,
-                                                 dst_step_w,
-                                                 dst_offset_first_element_in_bytes);
-}
-
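Note: the 4x1 wrapper above only forwards its arguments, so a caller selects the 1-D behaviour purely by kernel name plus the matching transform define appended to the build options shown earlier. A sketch of that pairing; filter_h and the error handling are assumed caller-side details, not part of this patch:

    /* 3x1 filters take the wrapper plus the horizontal define;
     * full 3x3 filters take the 2-D kernel with neither define. */
    const int horizontal = (filter_h == 1);
    const char *name = horizontal ? "winograd_input_transform_4x1_3x1_stepz1_nhwc"
                                  : "winograd_input_transform_4x4_3x3_stepz1_nhwc";
    const char *extra_opt = horizontal ? " -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL" : "";
    cl_int err = CL_SUCCESS;
    cl_kernel kernel = clCreateKernel(program, name, &err);
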
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (e.g. -DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(
-    TENSOR4D(src, BUFFER),
-    TENSOR4D(dst, BUFFER))
-{
-    winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr, src_stride_x, src_step_x, src_stride_y, src_step_y,
-                                                 src_stride_z, src_step_z, src_stride_w, src_step_w,
-                                                 src_offset_first_element_in_bytes,
-                                                 dst_ptr, dst_stride_x, dst_step_x, dst_stride_y, dst_step_y,
-                                                 dst_stride_z, dst_step_z, dst_stride_w, dst_step_w,
-                                                 dst_offset_first_element_in_bytes);
-}
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (e.g. -DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc(
-    TENSOR4D(src, BUFFER),
-    TENSOR4D(dst, BUFFER))
-{
-    winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr, src_stride_x, src_step_x, src_stride_y, src_step_y,
-                                                 src_stride_z, src_step_z, src_stride_w, src_step_w,
-                                                 src_offset_first_element_in_bytes,
-                                                 dst_ptr, dst_stride_x, dst_step_x, dst_stride_y, dst_step_y,
-                                                 dst_stride_z, dst_step_z, dst_stride_w, dst_step_w,
-                                                 dst_offset_first_element_in_bytes);
-}
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (e.g. -DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc(
-    TENSOR4D(src, BUFFER),
-    TENSOR4D(dst, BUFFER))
-{
-    winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr, src_stride_x, src_step_x, src_stride_y, src_step_y,
-                                                 src_stride_z, src_step_z, src_stride_w, src_step_w,
-                                                 src_offset_first_element_in_bytes,
-                                                 dst_ptr, dst_stride_x, dst_step_x, dst_stride_y, dst_step_y,
-                                                 dst_stride_z, dst_step_z, dst_stride_w, dst_step_w,
-                                                 dst_offset_first_element_in_bytes);
-}
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (e.g. -DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc(
-    TENSOR4D(src, BUFFER),
-    TENSOR4D(dst, BUFFER))
-{
-    winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr, src_stride_x, src_step_x, src_stride_y, src_step_y,
-                                                 src_stride_z, src_step_z, src_stride_w, src_step_w,
-                                                 src_offset_first_element_in_bytes,
-                                                 dst_ptr, dst_stride_x, dst_step_x, dst_stride_y, dst_step_y,
-                                                 dst_stride_z, dst_step_z, dst_stride_w, dst_step_w,
-                                                 dst_offset_first_element_in_bytes);
-}
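A pattern worth noting across these wrappers: a Winograd transform producing an M-element output tile from an R-tap kernel always reads an input tile of M + R − 1 points,

\[ T = M + R - 1: \quad 4 + 3 - 1 = 6, \qquad 4 + 5 - 1 = 8, \qquad 2 + 7 - 1 = 8, \]

which is why the 4x4/3x3 path works on 6x6 tiles while both the 4x4/5x5 and 2x2/7x7 paths work on the 8x8 tiles (64 channels) seen above. This is the standard Winograd F(M, R) relation rather than something stated in the patch.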
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (e.g. -DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc(
-    TENSOR4D(src, BUFFER),
-    TENSOR4D(dst, BUFFER))
-{
-    winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr, src_stride_x, src_step_x, src_stride_y, src_step_y,
-                                                 src_stride_z, src_step_z, src_stride_w, src_step_w,
-                                                 src_offset_first_element_in_bytes,
-                                                 dst_ptr, dst_stride_x, dst_step_x, dst_stride_y, dst_step_y,
-                                                 dst_stride_z, dst_step_z, dst_stride_w, dst_step_w,
-                                                 dst_offset_first_element_in_bytes);
-}
-#endif // defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y)
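The NCHW wrappers that follow switch to the TENSOR3D_DECLARATION calling convention and pass the batch strides (src_stride_w/dst_stride_w) as two extra scalar arguments. The -DNUM_TILES_X value they rely on has to be derived on the host. Below is a sketch of the usual stride-1 Winograd tiling arithmetic; the exact formula Compute Library uses is not part of this hunk, so treat the helper names and the example numbers as assumptions.

    #include <stddef.h>

    /* Ceiling division helper (assumed, for illustration). */
    static size_t div_ceil(size_t a, size_t b) { return (a + b - 1) / b; }

    /* Number of Winograd tiles needed to cover a stride-1 convolution output.
     * out_dim = in_dim + pad_before + pad_after - kernel_dim + 1 is the usual
     * stride-1 output size; each tile produces tile_dim output elements. */
    static size_t num_tiles(size_t in_dim, size_t pad_before, size_t pad_after,
                            size_t kernel_dim, size_t tile_dim)
    {
        size_t out_dim = in_dim + pad_before + pad_after - kernel_dim + 1;
        return div_ceil(out_dim, tile_dim);
    }
    /* e.g. num_tiles(96, 1, 1, 3, 4) == 24  ->  -DNUM_TILES_X=24 */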
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 2x1
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_2x1_3x1_stepz1_nchw(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst),
-    uint src_stride_w,
-    uint dst_stride_w)
-{
-    winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr, src_stride_x, src_step_x, src_stride_y, src_step_y,
-                                                 src_stride_z, src_step_z, src_offset_first_element_in_bytes,
-                                                 dst_ptr, dst_stride_x, dst_step_x, dst_stride_y, dst_step_y,
-                                                 dst_stride_z, dst_step_z, dst_offset_first_element_in_bytes,
-                                                 src_stride_w, dst_stride_w);
-}
-
-/** This OpenCL kernel computes the input transform when the kernel size is 3x1, the output tile is 2x1 and the number of channels is a multiple of 2
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_2x1_3x1_stepz2_nchw(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst),
-    uint src_stride_w,
-    uint dst_stride_w)
-{
-    winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr, src_stride_x, src_step_x, src_stride_y, src_step_y,
-                                                 src_stride_z, src_step_z, src_offset_first_element_in_bytes,
-                                                 dst_ptr, dst_stride_x, dst_step_x, dst_stride_y, dst_step_y,
-                                                 dst_stride_z, dst_step_z, dst_offset_first_element_in_bytes,
-                                                 src_stride_w, dst_stride_w);
-}
-
-/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_4x1_3x1_stepz1_nchw(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst),
-    uint src_stride_w,
-    uint dst_stride_w)
-{
-    winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr, src_stride_x, src_step_x, src_stride_y, src_step_y,
-                                                 src_stride_z, src_step_z, src_offset_first_element_in_bytes,
-                                                 dst_ptr, dst_stride_x, dst_step_x, dst_stride_y, dst_step_y,
-                                                 dst_stride_z, dst_step_z, dst_offset_first_element_in_bytes,
-                                                 src_stride_w, dst_stride_w);
-}
-
-/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 when the data layout is NCHW
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_4x1_5x1_stepz1_nchw(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst),
-    uint src_stride_w,
-    uint dst_stride_w)
-{
-    winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr, src_stride_x, src_step_x, src_stride_y, src_step_y,
-                                                 src_stride_z, src_step_z, src_offset_first_element_in_bytes,
-                                                 dst_ptr, dst_stride_x, dst_step_x, dst_stride_y, dst_step_y,
-                                                 dst_stride_z, dst_step_z, dst_offset_first_element_in_bytes,
-                                                 src_stride_w, dst_stride_w);
-}
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
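For orientation, the 2x2/3x3 helper that all four horizontal wrappers above delegate to computes, per 4-point input window, the standard Winograd F(2, 3) input transform. The helper body is defined earlier in this deleted file and is not shown in this hunk, so the exact sign conventions may differ:

\[ B^T = \begin{pmatrix} 1 & 0 & -1 & 0 \\ 0 & 1 & 1 & 0 \\ 0 & -1 & 1 & 0 \\ 0 & 1 & 0 & -1 \end{pmatrix}, \qquad \hat{d} = B^T d B \ \text{(2D)} \quad \text{or} \quad \hat{d} = B^T d \ \text{(1D)}. \]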
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x2
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_1x2_1x3_stepz1_nchw(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst),
-    uint src_stride_w,
-    uint dst_stride_w)
-{
-    winograd_input_transform_2x2_3x3_stepz1_nchw(src_ptr, src_stride_x, src_step_x, src_stride_y, src_step_y,
-                                                 src_stride_z, src_step_z, src_offset_first_element_in_bytes,
-                                                 dst_ptr, dst_stride_x, dst_step_x, dst_stride_y, dst_step_y,
-                                                 dst_stride_z, dst_step_z, dst_offset_first_element_in_bytes,
-                                                 src_stride_w, dst_stride_w);
-}
-
-/** This OpenCL kernel computes the input transform when the kernel size is 1x3, the output tile is 1x2 and the number of channels is a multiple of 2
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_1x2_1x3_stepz2_nchw(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst),
-    uint src_stride_w,
-    uint dst_stride_w)
-{
-    winograd_input_transform_2x2_3x3_stepz2_nchw(src_ptr, src_stride_x, src_step_x, src_stride_y, src_step_y,
-                                                 src_stride_z, src_step_z, src_offset_first_element_in_bytes,
-                                                 dst_ptr, dst_stride_x, dst_step_x, dst_stride_y, dst_step_y,
-                                                 dst_stride_z, dst_step_z, dst_offset_first_element_in_bytes,
-                                                 src_stride_w, dst_stride_w);
-}
-
-/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_1x4_1x3_stepz1_nchw(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst),
-    uint src_stride_w,
-    uint dst_stride_w)
-{
-    winograd_input_transform_4x4_3x3_stepz1_nchw(src_ptr, src_stride_x, src_step_x, src_stride_y, src_step_y,
-                                                 src_stride_z, src_step_z, src_offset_first_element_in_bytes,
-                                                 dst_ptr, dst_stride_x, dst_step_x, dst_stride_y, dst_step_y,
-                                                 dst_stride_z, dst_step_z, dst_offset_first_element_in_bytes,
-                                                 src_stride_w, dst_stride_w);
-}
-
-/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4
- *
- * @note The number of tiles in the x axis must be passed at compile time using -DNUM_TILES_X (e.g. -DNUM_TILES_X=5).
- * @note The pad left and pad top must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=1 and -DPAD_TOP=0).
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- */
-__kernel void winograd_input_transform_1x4_1x5_stepz1_nchw(
-    TENSOR3D_DECLARATION(src),
-    TENSOR3D_DECLARATION(dst),
-    uint src_stride_w,
-    uint dst_stride_w)
-{
-    winograd_input_transform_4x4_5x5_stepz1_nchw(src_ptr, src_stride_x, src_step_x, src_stride_y, src_step_y,
-                                                 src_stride_z, src_step_z, src_offset_first_element_in_bytes,
-                                                 dst_ptr, dst_stride_x, dst_step_x, dst_stride_y, dst_step_y,
-                                                 dst_stride_z, dst_step_z, dst_offset_first_element_in_bytes,
-                                                 src_stride_w, dst_stride_w);
-}
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
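The second file deleted by this patch, winograd_output_transform.cl, implements the closing stage of the pipeline: given the element-wise GEMM result for one tile, it applies the inverse transform, then the optional bias and fused activation. In the standard Winograd notation (for orientation only; only the A^T stage lives in this file):

\[ Y = A^T \left[ (G g G^T) \odot (B^T d B) \right] A. \]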
diff --git a/src/core/CL/cl_kernels/winograd_output_transform.cl b/src/core/CL/cl_kernels/winograd_output_transform.cl
deleted file mode 100644
index 6a3e6d3346..0000000000
--- a/src/core/CL/cl_kernels/winograd_output_transform.cl
+++ /dev/null
@@ -1,2063 +0,0 @@
-/*
- * Copyright (c) 2018-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "activation_float_helpers.h"
-#include "helpers.h"
-#include "tile_helpers.h"
-
-#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
-#if defined(VEC_SIZE) && VEC_SIZE == 2
-/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size is 3x3/3x1 or 1x3 and the data layout is NCHW
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively.
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. Accepted values are -DVEC_SIZE=2 (for output_tile_size 2x2, 2x1, 1x2) and -DVEC_SIZE=4 (for output_tile_size 4x4, 4x1, 1x4)
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_2x2_3x3_nchw(
-    TENSOR4D_DECLARATION(src),
-    TENSOR4D_DECLARATION(dst)
-#if defined(HAS_BIAS)
-    ,
-    VECTOR_DECLARATION(bias)
-#endif // defined(HAS_BIAS)
-)
-{
-    // Each thread stores a 2x2/2x1 or 1x2 tile according to the filter size
-#if defined(SRC_DEPTH)
-    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
-    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
-#else  /* defined(SRC_DEPTH) */
-    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-    const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
-#endif /* defined(SRC_DEPTH) */
-
-    // Load the values across the 16 or 4 channels to compose the 4x4 or 4x1 tile
-    DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
-    DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
-    DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
-    DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    // Compute the 2x1 or 1x2 output tile
-    // out00 = d00 + d01 + d02
-    // out01 = d01 - d02 - d03
-
-    float out00 = d00 + d01 + d02;
-    float out01 = d01 - d02 - d03;
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
-    DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
-    DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
-    DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
-    DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
-
-    DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
-    DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
-    DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
-    DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
-
-    DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
-    DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
-    DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
-    DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
-
-    // Compute the 2x2 output tile
-    float k0 = d01 + d11 + d21;
-    float k1 = d02 + d12 + d22;
-    float k2 = d11 - d21 - d31;
-    float k3 = d12 - d22 - d32;
-
-    // out00 = d00 + d10 + d20 + d01 + d11 + d21 + d02 + d12 + d22
-    // out01 = d01 + d11 + d21 - (d02 + d12 + d22) - (d03 + d13 + d23)
-    // out10 = d10 - d20 - d30 + (d11 - d21 - d31) + (d12 - d22 - d32)
-    // out11 = d11 - d21 - d31 - (d12 - d22 - d32) - (d13 - d23 - d33)
-
-    float out00 = d10;
-    float out01 = -d13;
-    float out10 = d10;
-    float out11 = -d13;
-
-    out00 += d00 + d20 + k0 + k1;
-    out01 += k0 - k1 - (d03 + d23);
-    out10 += -d20 - d30 + k2 + k3;
-    out11 += k2 - k3 + d23 + d33;
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
-    int y_in  = get_global_id(1);
-    int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;
-    int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;
-    int z_out = get_global_id(0);
-#if defined(SRC_DEPTH)
-    int batch = get_global_id(2) / SRC_DEPTH;
-#endif /* defined(SRC_DEPTH) */
-
-#if defined(HAS_BIAS)
-    // Add bias
-    Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);
-
-    float b = (float)*((__global DATA_TYPE *)(vector_offset(&bias, z_out)));
-
-    out00 += (float)b;
-    out01 += (float)b;
-#endif // defined(HAS_BIAS)
-
-    // Get output address
-#if defined(SRC_DEPTH)
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w;
-#else  /* defined(SRC_DEPTH) */
-    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z;
-#endif /* defined(SRC_DEPTH) */
-
-    // Store the output tile
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    const VEC_DATA_TYPE(DATA_TYPE, 2)
-    out0_dt = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL);
-    *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0;
-    *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1;
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out00, out01), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,
-            (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
-#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-#if defined(HAS_BIAS)
-    // Add bias
-    out10 += (DATA_TYPE)b;
-    out11 += (DATA_TYPE)b;
-#endif // defined(HAS_BIAS)
-    vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, CONVERT((VEC_DATA_TYPE(float, 2))(out10, out11), VEC_DATA_TYPE(DATA_TYPE, 2)), A_VAL, B_VAL), 0,
-            (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
-#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-}
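The scalar arithmetic in the kernel above is the A^T stage for F(2x2, 3x3) written out by hand. In the 1D branch, out00 and out01 are exactly the two rows of

\[ A^T = \begin{pmatrix} 1 & 1 & 1 & 0 \\ 0 & 1 & -1 & -1 \end{pmatrix} \]

applied to (d00, d01, d02, d03). In the 2D branch, the k0...k3 temporaries first fold the rows that share weights (k0 = d01 + d11 + d21, and so on) so that the same two-row combination can then be applied across columns, saving additions. The fused activation is selected at build time, e.g. with -DACTIVATION_TYPE=relu plus -DA_VAL/-DB_VAL where the chosen function needs them.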
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size is 7x7/7x1 or 1x7 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_2x2_7x7_nhwc(
-    TENSOR4D(src, BUFFER),
-    TENSOR4D(dst, BUFFER),
-#if defined(HAS_BIAS)
-    VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
-    int dst_size)
-{
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _IDST_WIDTH DST_WIDTH
-#define _IDST_HEIGHT DST_HEIGHT
-#define _INUM_TILES_X NUM_TILES_X
-
-    const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
-    const int mout = GET_SPATIAL_IDX(1, 1, 0);  // WINOGRAD OUTPUT TILES
-    const int bout = GET_SPATIAL_IDX(2, 1, 0);  // BATCH SIZE IDX
-
-    int x_out = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
-    int y_out = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    TILE(DATA_TYPE, 8, N0, in);
-    TILE(DATA_TYPE, 2, N0, out);
-    TILE(uint, 8, 1, src_indirect_y);
-
-    // Calculate the indirect Y for the source tensor
-    LOOP_UNROLLING(int, i, 0, 1, 8,
-    {
-        src_indirect_y[i].v = mout + i * _ISRC_HEIGHT;
-        src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 8);
-    })
-
-    // Initialize the input tile
-    LOOP_UNROLLING(int, i, 0, 1, 8,
-    {
-        in[i].v = 0;
-    })
-
-    // Load the values across the 8 channels to compose the 8x1 tile
-    T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
-
-    // Compute out[0] and out[1]
-    out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v + in[5].v + in[6].v;
-    out[1].v = -in[1].v + in[2].v - 2.f * in[3].v + 2.0f * in[4].v - 3.0f * in[5].v + 3.0f * in[6].v + in[7].v;
-
-#if defined(HAS_BIAS)
-    // Add bias
-    TILE(DATA_TYPE, 1, N0, b);
-
-    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
-
-    T_ADD_BROADCAST_X(DATA_TYPE, 2, N0, out, b, out);
-#endif // defined(HAS_BIAS)
-
-    T_ACTIVATION(DATA_TYPE, 2, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
-
-    TILE(uint, 2, 1, dst_indirect_y);
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    LOOP_UNROLLING(int, yk, 0, 1, 2,
-    {
-        int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));
-        dst_indirect_y[yk].v = x_out + y_c * (int)(_IDST_WIDTH);
-    })
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-    LOOP_UNROLLING(int, xk, 0, 1, 2,
-    {
-        int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));
-        dst_indirect_y[xk].v = x_c + y_out * (int)(_IDST_WIDTH);
-    })
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
-    // Store the tile in reverse order so the invalid values are overwritten with the valid ones
-    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 2, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
-    TILE(DATA_TYPE, 64, N0, in);
-    TILE(DATA_TYPE, 4, N0, out);
-    TILE(DATA_TYPE, 16, N0, tmp);
-    TILE(uint, 64, 1, src_indirect_y);
-
-    // Calculate the indirect Y for the source tensor
-    LOOP_UNROLLING(int, i, 0, 1, 64,
-    {
-        src_indirect_y[i].v = mout + i * _ISRC_HEIGHT;
-        src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 64);
-    })
-
-    // Initialize the input tile
-    LOOP_UNROLLING(int, i, 0, 1, 64,
-    {
-        in[i].v = 0;
-    })
-
-    // Load the values across the 64 channels to compose the 8x8 tile
-    T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
-
-    LOOP_UNROLLING(int, i, 0, 1, 8,
-    {
-        tmp[i * 2].v     = in[0 + i].v + in[8 + i].v + in[16 + i].v + in[24 + i].v + in[32 + i].v + in[40 + i].v + in[48 + i].v;
-        tmp[i * 2 + 1].v = -in[8 + i].v + in[16 + i].v - 2 * in[24 + i].v + 2 * in[32 + i].v + -3 * in[40 + i].v + 3 * in[48 + i].v + in[56 + i].v;
-    })
-
-    // Compute the 2x2 output tile
-    LOOP_UNROLLING(int, i, 0, 1, 2,
-    {
-        out[i * 2].v     = tmp[0 + i].v + tmp[2 + i].v + tmp[4 + i].v + tmp[6 + i].v + tmp[8 + i].v + tmp[10 + i].v + tmp[12 + i].v;
-        out[i * 2 + 1].v = -tmp[2 + i].v + tmp[4 + i].v - 2 * tmp[6 + i].v + 2 * tmp[8 + i].v - 3 * tmp[10 + i].v + 3 * tmp[12 + i].v + tmp[14 + i].v;
-    })
-
-#if defined(HAS_BIAS)
-    // Add bias
-    TILE(DATA_TYPE, 1, N0, b);
-
-    T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
-
-    T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
-#endif // defined(HAS_BIAS)
-
-    T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
-
-    TILE(uint, 4, 1, dst_indirect_y);
-
-    // Calculate the destination indirect Y
-    LOOP_UNROLLING(int, yk, 0, 1, 2,
-    {
-        LOOP_UNROLLING(int, xk, 0, 1, 2,
-        {
-            int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));
-            int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));
-            dst_indirect_y[xk + yk * 2].v = x_c + y_c * _IDST_WIDTH;
-            dst_indirect_y[xk + yk * 2].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
-        })
-    })
-
-    // Store the tile in reverse order so the invalid values are overwritten with the valid ones
-    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-}
-#endif // defined(VEC_SIZE) && VEC_SIZE == 2
Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_4x4_3x3_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(bias)
-#endif // defined(HAS_BIAS)
-)
-{
- // Each thread stores a 4x4/4x1 or 1x4 tile
-#if defined(SRC_DEPTH)
- Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH);
- const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
-#else /* defined(SRC_DEPTH) */
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);
-#endif /* defined(SRC_DEPTH) */
-
- // Load the values across the channels to compose the 6x6 or 6x1 tile
- DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z));
- DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z));
- DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z));
- DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z));
- DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z));
- DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z));
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- // Compute out00, out01, out02 and out03
- float out00 = d00 + d01 + d02 + d03 + d04;
- float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;
- float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;
- float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;
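- // The four expressions above are the matrix product out = A^T * d, where d = (d00, ..., d05)^T is the
- // 6x1 input tile and A^T is the Winograd F(4,3) output-transform matrix:
- //
- //       | 1  1   1  1   1  0 |
- // A^T = | 0  1  -1  2  -2  0 |
- //       | 0  1   1  4   4  0 |
- //       | 0  1  -1  8  -8  1 |
- //
- // The 2D branch below applies the same matrix on both sides (A^T * d * A). There, each group of four
- // outputs deliberately starts from the same column-1 sum and only diverges through the "+=" corrections;
- // also, because the last row/column of A^T is zero except for the trailing 1, the d5x inputs contribute
- // only to out3x and the dX5 inputs only to outX3.
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z));
- DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z));
- DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z));
- DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z));
- DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z));
- DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z));
-
- DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z));
- DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z));
- DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z));
- DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z));
- DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z));
- DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z));
-
- DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z));
- DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z));
- DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z));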
- DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z));
- DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z));
- DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z));
-
- DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z));
- DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z));
- DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z));
- DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z));
- DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z));
- DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z));
-
- DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z));
- DATA_TYPE d51 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z));
- DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z));
- DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z));
- DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z));
- DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z));
-
- // Compute out00, out01, out02 and out03
- float out00 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
- float out01 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
- float out02 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
- float out03 = (float)d01 + (float)d21 + (float)d41 + (float)d11 + (float)d31;
-
- float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;
- float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;
-
- out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;
- out01 += k1 - d02 - d12 - d22 - d32 - d42;
- out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;
- out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;
-
- // Compute out10, out11, out12 and out13
- float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
- float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
- float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
- float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;
-
- k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;
- k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;
-
- out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;
- out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;
- out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;
- out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;
-
- // Compute out20, out21, out22 and out23
- float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
- float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
- float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
- float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;
-
- k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;
- k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;
-
- out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;
- out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;
- out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;
- out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;
-
- // Compute out30, out31, out32 and out33
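- // (k0/k1 again factor the d*3/d*4 column terms shared by the four outputs of this row; the d5*
- // inputs appear only in this group because row 3 of A^T, (0 1 -1 8 -8 1), is the only row with a
- // non-zero last entry)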
float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51; - float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51; - float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51; - float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51; - - k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54; - k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54; - - out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52; - out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52; - out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52; - out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55; -#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - - int y_in = get_global_id(1); - int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W; - int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H; - int z_out = get_global_id(0); -#if defined(SRC_DEPTH) - int batch = get_global_id(2) / SRC_DEPTH; -#endif /* defined(SRC_DEPTH) */ - -#if defined(HAS_BIAS) - // Add bias - Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias); - - float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out))); - - out00 += (float)b; - out01 += (float)b; - out02 += (float)b; - out03 += (float)b; -#endif // defined(HAS_BIAS) - - // Get output address -#if defined(SRC_DEPTH) - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w; -#else /* defined(SRC_DEPTH) */ - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z; -#endif /* defined(SRC_DEPTH) */ - - // Store the output tile - const VEC_DATA_TYPE(DATA_TYPE, 4) - out0_dt = CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)); - -#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0; - *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1; - *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2; - *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3; -#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - vstore4(out0_dt, 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)); -#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - -#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) -#if defined(HAS_BIAS) - // Add bias - out10 += (float)b; - out11 += (float)b; - out12 += (float)b; - out13 += (float)b; - - out20 += (float)b; - out21 += (float)b; - out22 += (float)b; - out23 += (float)b; - - out30 += (float)b; - out31 += (float)b; - out32 += (float)b; - out33 += (float)b; -#endif // defined(HAS_BIAS) - vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out10, out11, out12, out13), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)), 0, - (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)); - vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out20, out21, out22, out23), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)), 0, - (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)); - 
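- // (rows 1..3 of the 4x4 tile are written by these vstore4 calls at consecutive dst_stride_y offsets;
- // they exist only in the 2D transform: the 4x1/1x4 variants store just row 0 above)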
vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out30, out31, out32, out33), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)), 0, - (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)); -#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) -} - -/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC - * - * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 - * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 - * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 - * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32 - * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 - * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 - * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time - * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1 - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] dst_size Size of the destination tensor, minus the last padding
- */
-__kernel void winograd_output_transform_4x4_3x3_nhwc(
- TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
- int dst_size)
-{
- const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
- const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
- const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- TILE(DATA_TYPE, 6, N0, in);
- TILE(DATA_TYPE, 4, N0, out);
- TILE(uint, 6, 1, src_indirect_y);
-
- LOOP_UNROLLING(int, i, 0, 1, 6,
- {
- src_indirect_y[i].v = mout + i * SRC_HEIGHT;
- src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 6);
- })
-
- // Initialize the input tile
- LOOP_UNROLLING(int, i, 0, 1, 6,
- {
- in[i].v = 0;
- })
-
- // Load the values across the 6 channels to compose the 6x1 or 1x6 tile
- T_LOAD_INDIRECT(DATA_TYPE, 6, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
-
- // Compute out00, out01, out02 and out03
- out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v;
- out[1].v = in[1].v - in[2].v + 2.0f * in[3].v - 2.0f * in[4].v;
- out[2].v = in[1].v + in[2].v + 4.0f * in[3].v + 4.0f * in[4].v;
- out[3].v = in[1].v - in[2].v + 8.0f * in[3].v - 8.0f * in[4].v + in[5].v;
-
-#if defined(HAS_BIAS)
- TILE(DATA_TYPE, 1, N0, b);
-
- T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
-
- // c = c + bias[broadcasted]
- T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
-#endif // HAS_BIAS
-
- int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
- int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
-
- T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
-
- TILE(uint, 4, 1, dst_indirect_y);
-
- // Calculate the destination indirect Y
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- LOOP_UNROLLING(int, yk, 0, 1, 4,
- {
- int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
- dst_indirect_y[yk].v = x_out + y_c * DST_WIDTH;
- dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
- })
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- LOOP_UNROLLING(int, xk, 0, 1, 4,
- {
- int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
- dst_indirect_y[xk].v = x_c + y_out * DST_WIDTH;
- dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
- })
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- // Store the tile in reverse order so the invalid values are overwritten with the valid ones
- T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- TILE(DATA_TYPE, 36, N0, in);
- TILE(DATA_TYPE, 4, N0, tmp);
- TILE(uint, 36, 1, src_indirect_y);
-
- // Calculate the indirect Y for the source tensor
- LOOP_UNROLLING(int, i, 0, 1, 36,
- {
- src_indirect_y[i].v = mout + i * SRC_HEIGHT;
- src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 36);
- })
-
- // Initialize the input tile
- LOOP_UNROLLING(int, i, 0, 1, 36,
- {
- in[i].v = 0;
- })
-
- // Load the values across the 36 channels to compose the 6x6 tile
- T_LOAD_INDIRECT(DATA_TYPE, 36, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
-
- LOOP_UNROLLING(int, i, 0, 1, 6,
- {
- tmp[0].v = in[6 + i].v + in[12 + i].v;
- tmp[1].v = in[6 + i].v - in[12 + i].v;
- tmp[2].v = in[18 + i].v + in[24 + i].v;
- tmp[3].v = in[18 + i].v - in[24 + i].v;
- tmp[3].v = tmp[3].v + tmp[3].v;
- in[i].v = in[i].v + tmp[0].v + tmp[2].v;
- in[6 + i].v = tmp[3].v + tmp[1].v;
- in[12 + i].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
- in[18 + i].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[30 + i].v;
- })
-
- // Compute the output tile
- TILE(DATA_TYPE, 16, N0, out);
-
- LOOP_UNROLLING(int, i, 0, 1, 4,
- {
- tmp[0].v = in[6 * i + 1].v + in[6 * i + 2].v;
- tmp[1].v = in[6 * i + 1].v - in[6 * i + 2].v;
- tmp[2].v = in[6 * i + 3].v + in[6 * i + 4].v;
- tmp[3].v = in[6 * i + 3].v - in[6 * i + 4].v;
- tmp[3].v = tmp[3].v + tmp[3].v;
- out[4 * i + 0].v = in[6 * i + 0].v + tmp[0].v + tmp[2].v;
- out[4 * i + 1].v = tmp[3].v + tmp[1].v;
- out[4 * i + 2].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
- out[4 * i + 3].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[6 * i + 5].v;
- })
-
-#if defined(HAS_BIAS)
- TILE(DATA_TYPE, 1, N0, b);
-
- T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
-
- // c = c + bias[broadcasted]
- T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);
-#endif // HAS_BIAS
-
- int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
- int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
-
- T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
-
- TILE(uint, 16, 1, dst_indirect_y);
-
- // Calculate the destination indirect Y
- LOOP_UNROLLING(int, yk, 0, 1, 4,
- {
- LOOP_UNROLLING(int, xk, 0, 1, 4,
- {
- int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
- int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
- dst_indirect_y[xk + yk * 4].v = x_c + y_c * DST_WIDTH;
- dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
- })
- })
-
- // Store the tile in reverse order so the invalid values are overwritten with the valid ones
- T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-}
-
-#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \
- ({ \
- comm_fact.s0 = d1 + d2; \
- comm_fact.s1 = d3 + d4; \
- comm_fact.s2 = d5 + d6; \
- \
- col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0; \
- col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \
- \
- comm_fact.s0 = d1 - d2; \
- comm_fact.s1 = d3 - d4; \
- comm_fact.s2 = d5 - d6; \
- \
- col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \
- col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7; \
- })
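-
- // COMPUTE_TMP_COL evaluates one column of A^T * d for the F(4,5) transform, whose 8-point
- // output-transform matrix is:
- //
- //       | 1  1   1  1   1  8   8  0 |
- // A^T = | 0  1  -1  2  -2  4  -4  0 |
- //       | 0  1   1  4   4  2   2  0 |
- //       | 0  1  -1  8  -8  1  -1  1 |
- //
- // col.s0/col.s2 use the pairwise sums (d1 + d2, d3 + d4, d5 + d6) and col.s1/col.s3 the pairwise
- // differences, which is why comm_fact is recomputed between the two halves of the macro.
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW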
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor.
Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_output_transform_4x4_5x5_nchw( - TENSOR4D_DECLARATION(src), - TENSOR4D_DECLARATION(dst) -#if defined(HAS_BIAS) - , - VECTOR_DECLARATION(bias) -#endif // defined(HAS_BIAS) -) -{ - // Each thread stores a 4x4/4x1 or 1x4 tile -#if defined(SRC_DEPTH) - Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DEPTH); - const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0); -#else /* defined(SRC_DEPTH) */ - - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); - const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0); -#endif /* defined(SRC_DEPTH) */ - - // Compute output address - int y_in = get_global_id(1); - int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W; - int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H; - int z_out = get_global_id(0); -#if defined(SRC_DEPTH) - int batch = get_global_id(2) / SRC_DEPTH; -#endif /* defined(SRC_DEPTH) */ - -#if defined(SRC_DEPTH) - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z + batch * dst_stride_w; -#else /* defined(SRC_DEPTH) */ - - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(DATA_TYPE) + y_out * dst_stride_y + z_out * dst_stride_z; -#endif /* defined(SRC_DEPTH) */ - - // Load the values across the channels to compose the input tile - DATA_TYPE d00 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_z)); - DATA_TYPE d01 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_z)); - DATA_TYPE d02 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_z)); - DATA_TYPE d03 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_z)); - DATA_TYPE d04 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_z)); - DATA_TYPE d05 = *((__global DATA_TYPE *)(src_addr + 5 * src_stride_z)); - DATA_TYPE d06 = *((__global DATA_TYPE *)(src_addr + 6 * src_stride_z)); - DATA_TYPE d07 = *((__global DATA_TYPE *)(src_addr + 7 * src_stride_z)); - -#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - // Compute out00, out01, out02 and out03 - float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06; - float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06; - float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06; - float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07; - -#if defined(HAS_BIAS) - // Add bias - Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias); - - float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out))); - - out00 += (DATA_TYPE)b; - out01 += (DATA_TYPE)b; - out02 += (DATA_TYPE)b; - out03 
+= (DATA_TYPE)b; -#endif // defined(HAS_BIAS) - - // Store the output tile -#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - VEC_DATA_TYPE(DATA_TYPE, 4) - out0_dt = CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), A_VAL, - B_VAL), - VEC_DATA_TYPE(DATA_TYPE, 4)); - *((__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)) = out0_dt.s0; - *((__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)) = out0_dt.s1; - *((__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)) = out0_dt.s2; - *((__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)) = out0_dt.s3; -#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out00, out01, out02, out03), A_VAL, B_VAL), VEC_DATA_TYPE(DATA_TYPE, 4)), - 0, (__global DATA_TYPE *)(dst_addr)); -#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - -#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - - DATA_TYPE d10 = *((__global DATA_TYPE *)(src_addr + 8 * src_stride_z)); - DATA_TYPE d11 = *((__global DATA_TYPE *)(src_addr + 9 * src_stride_z)); - DATA_TYPE d12 = *((__global DATA_TYPE *)(src_addr + 10 * src_stride_z)); - DATA_TYPE d13 = *((__global DATA_TYPE *)(src_addr + 11 * src_stride_z)); - DATA_TYPE d14 = *((__global DATA_TYPE *)(src_addr + 12 * src_stride_z)); - DATA_TYPE d15 = *((__global DATA_TYPE *)(src_addr + 13 * src_stride_z)); - DATA_TYPE d16 = *((__global DATA_TYPE *)(src_addr + 14 * src_stride_z)); - DATA_TYPE d17 = *((__global DATA_TYPE *)(src_addr + 15 * src_stride_z)); - - DATA_TYPE d20 = *((__global DATA_TYPE *)(src_addr + 16 * src_stride_z)); - DATA_TYPE d21 = *((__global DATA_TYPE *)(src_addr + 17 * src_stride_z)); - DATA_TYPE d22 = *((__global DATA_TYPE *)(src_addr + 18 * src_stride_z)); - DATA_TYPE d23 = *((__global DATA_TYPE *)(src_addr + 19 * src_stride_z)); - DATA_TYPE d24 = *((__global DATA_TYPE *)(src_addr + 20 * src_stride_z)); - DATA_TYPE d25 = *((__global DATA_TYPE *)(src_addr + 21 * src_stride_z)); - DATA_TYPE d26 = *((__global DATA_TYPE *)(src_addr + 22 * src_stride_z)); - DATA_TYPE d27 = *((__global DATA_TYPE *)(src_addr + 23 * src_stride_z)); - - DATA_TYPE d30 = *((__global DATA_TYPE *)(src_addr + 24 * src_stride_z)); - DATA_TYPE d31 = *((__global DATA_TYPE *)(src_addr + 25 * src_stride_z)); - DATA_TYPE d32 = *((__global DATA_TYPE *)(src_addr + 26 * src_stride_z)); - DATA_TYPE d33 = *((__global DATA_TYPE *)(src_addr + 27 * src_stride_z)); - DATA_TYPE d34 = *((__global DATA_TYPE *)(src_addr + 28 * src_stride_z)); - DATA_TYPE d35 = *((__global DATA_TYPE *)(src_addr + 29 * src_stride_z)); - DATA_TYPE d36 = *((__global DATA_TYPE *)(src_addr + 30 * src_stride_z)); - DATA_TYPE d37 = *((__global DATA_TYPE *)(src_addr + 31 * src_stride_z)); - - DATA_TYPE d40 = *((__global DATA_TYPE *)(src_addr + 32 * src_stride_z)); - DATA_TYPE d41 = *((__global DATA_TYPE *)(src_addr + 33 * src_stride_z)); - DATA_TYPE d42 = *((__global DATA_TYPE *)(src_addr + 34 * src_stride_z)); - DATA_TYPE d43 = *((__global DATA_TYPE *)(src_addr + 35 * src_stride_z)); - DATA_TYPE d44 = *((__global DATA_TYPE *)(src_addr + 36 * src_stride_z)); - DATA_TYPE d45 = *((__global DATA_TYPE *)(src_addr + 37 * src_stride_z)); - DATA_TYPE d46 = *((__global DATA_TYPE *)(src_addr + 38 * src_stride_z)); - DATA_TYPE d47 = *((__global DATA_TYPE *)(src_addr + 39 * src_stride_z)); - - DATA_TYPE d50 = *((__global DATA_TYPE *)(src_addr + 40 * src_stride_z)); - DATA_TYPE d51 = *((__global DATA_TYPE 
*)(src_addr + 41 * src_stride_z)); - DATA_TYPE d52 = *((__global DATA_TYPE *)(src_addr + 42 * src_stride_z)); - DATA_TYPE d53 = *((__global DATA_TYPE *)(src_addr + 43 * src_stride_z)); - DATA_TYPE d54 = *((__global DATA_TYPE *)(src_addr + 44 * src_stride_z)); - DATA_TYPE d55 = *((__global DATA_TYPE *)(src_addr + 45 * src_stride_z)); - DATA_TYPE d56 = *((__global DATA_TYPE *)(src_addr + 46 * src_stride_z)); - DATA_TYPE d57 = *((__global DATA_TYPE *)(src_addr + 47 * src_stride_z)); - - DATA_TYPE d60 = *((__global DATA_TYPE *)(src_addr + 48 * src_stride_z)); - DATA_TYPE d61 = *((__global DATA_TYPE *)(src_addr + 49 * src_stride_z)); - DATA_TYPE d62 = *((__global DATA_TYPE *)(src_addr + 50 * src_stride_z)); - DATA_TYPE d63 = *((__global DATA_TYPE *)(src_addr + 51 * src_stride_z)); - DATA_TYPE d64 = *((__global DATA_TYPE *)(src_addr + 52 * src_stride_z)); - DATA_TYPE d65 = *((__global DATA_TYPE *)(src_addr + 53 * src_stride_z)); - DATA_TYPE d66 = *((__global DATA_TYPE *)(src_addr + 54 * src_stride_z)); - DATA_TYPE d67 = *((__global DATA_TYPE *)(src_addr + 55 * src_stride_z)); - - DATA_TYPE d70 = *((__global DATA_TYPE *)(src_addr + 56 * src_stride_z)); - DATA_TYPE d71 = *((__global DATA_TYPE *)(src_addr + 57 * src_stride_z)); - DATA_TYPE d72 = *((__global DATA_TYPE *)(src_addr + 58 * src_stride_z)); - DATA_TYPE d73 = *((__global DATA_TYPE *)(src_addr + 59 * src_stride_z)); - DATA_TYPE d74 = *((__global DATA_TYPE *)(src_addr + 60 * src_stride_z)); - DATA_TYPE d75 = *((__global DATA_TYPE *)(src_addr + 61 * src_stride_z)); - DATA_TYPE d76 = *((__global DATA_TYPE *)(src_addr + 62 * src_stride_z)); - DATA_TYPE d77 = *((__global DATA_TYPE *)(src_addr + 63 * src_stride_z)); - - // Compute the 8x4 intermediate tensor - VEC_DATA_TYPE(float, 4) - comm_fact0, comm_fact1, comm_fact2; - VEC_DATA_TYPE(float, 4) - tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7; - - COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0); - COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0); - COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0); - COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0); - COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0); - COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0); - COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0); - COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0); - - // Compute the 4x4 output tile - comm_fact0 = tmp_col1 + tmp_col2; - comm_fact1 = tmp_col3 + tmp_col4; - comm_fact2 = tmp_col5 + tmp_col6; - - VEC_DATA_TYPE(float, 4) - out_col0 = comm_fact0 + comm_fact1 + (float)8.f * comm_fact2 + tmp_col0; - VEC_DATA_TYPE(float, 4) - out_col2 = comm_fact0 + (float)4.f * comm_fact1 + (float)2.f * comm_fact2; - - comm_fact0 = tmp_col1 - tmp_col2; - comm_fact1 = tmp_col3 - tmp_col4; - comm_fact2 = tmp_col5 - tmp_col6; - - VEC_DATA_TYPE(float, 4) - out_col1 = comm_fact0 + (float)2.f * comm_fact1 + (float)4.f * comm_fact2; - VEC_DATA_TYPE(float, 4) - out_col3 = comm_fact0 + (float)8.f * comm_fact1 + comm_fact2 + tmp_col7; - -#if defined(HAS_BIAS) - // Add bias - Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias); - - float b = (float) * ((__global DATA_TYPE *)(vector_offset(&bias, z_out))); - - out_col0 += (VEC_DATA_TYPE(float, 4))b; - out_col1 += (VEC_DATA_TYPE(float, 4))b; - out_col2 += (VEC_DATA_TYPE(float, 4))b; - out_col3 += 
(VEC_DATA_TYPE(float, 4))b; -#endif // defined(HAS_BIAS) - - // Store the output tile - vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), A_VAL, B_VAL), - VEC_DATA_TYPE(DATA_TYPE, 4)), - 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y)); - vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), A_VAL, B_VAL), - VEC_DATA_TYPE(DATA_TYPE, 4)), - 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)); - vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), A_VAL, B_VAL), - VEC_DATA_TYPE(DATA_TYPE, 4)), - 0, (__global DATA_TYPE *)(dst_addr + 2 * dst_stride_y)); - vstore4(CONVERT(ACTIVATION(ACTIVATION_TYPE, float, VEC_SIZE, (VEC_DATA_TYPE(float, 4))(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), A_VAL, B_VAL), - VEC_DATA_TYPE(DATA_TYPE, 4)), - 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)); -#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) -} - -/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC - * - * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 - * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 - * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 - * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32 - * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 - * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 - * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time - * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1 - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_output_transform_4x4_5x5_nhwc( - TENSOR4D(src, BUFFER), - TENSOR4D(dst, BUFFER), -#if defined(HAS_BIAS) - VECTOR_DECLARATION(bias), -#endif // defined(HAS_BIAS) - int dst_size) -{ - const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM - const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES - const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX - -#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - TILE(DATA_TYPE, 8, N0, in); - TILE(DATA_TYPE, 4, N0, out); - TILE(DATA_TYPE, 4, N0, tmp); - TILE(uint, 8, 1, src_indirect_y); - - LOOP_UNROLLING(int, i, 0, 1, 8, - { - src_indirect_y[i].v = mout + i * SRC_HEIGHT; - src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 8); - }) - - // Initialize the input tile - LOOP_UNROLLING(int, i, 0, 1, 8, - { - in[i].v = 0; - }) - - // "in" contains 1x8 or 8x1 tile here - T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); - - // A^T * in, and in this degenerate case out consists of 1 column/row - tmp[0].v = in[1].v - in[2].v; - tmp[1].v = 2.0f * (in[3].v - in[4].v); - tmp[2].v = 2.0f * (in[5].v + in[6].v); - tmp[3].v = in[3].v + in[4].v; - out[0].v = in[0].v + in[1].v + in[2].v + tmp[3].v + 4.0f * tmp[2].v; - out[1].v = tmp[0].v + tmp[1].v + 4.0f * (in[5].v - in[6].v); - out[2].v = in[1].v + in[2].v + 4.0f * tmp[3].v + tmp[2].v; - out[3].v = tmp[0].v + 4.0f * tmp[1].v + in[5].v - in[6].v + in[7].v; - -#if defined(HAS_BIAS) - TILE(DATA_TYPE, 1, N0, b); - - T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); - - // c = c + bias[broadcasted] - T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out); -#endif // HAS_BIAS - - int x_out = (mout % NUM_TILES_X) * 
OUTPUT_TILE_W; - int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H; - - T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); - - TILE(uint, 4, 1, dst_indirect_y); - - // Calculate the destination indirect Y -#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - LOOP_UNROLLING(int, yk, 0, 1, 4, - { - int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1)); - dst_indirect_y[yk].v = x_out + y_c * DST_WIDTH; - dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT); - }) -#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - LOOP_UNROLLING(int, xk, 0, 1, 4, - { - int x_c = min(x_out + xk, ((int)DST_WIDTH - 1)); - dst_indirect_y[xk].v = x_c + y_out * DST_WIDTH; - dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT); - }) -#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - - // Store the tile in reverse order so the invalid values are overwritten with the valid ones - T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); - -#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - // Calculate the indirect Y for the source tensor - TILE(DATA_TYPE, 64, N0, in); - TILE(DATA_TYPE, 6, N0, tmp); - TILE(uint, 64, 1, src_indirect_y); - - LOOP_UNROLLING(int, i, 0, 1, 64, - { - src_indirect_y[i].v = mout + i * SRC_HEIGHT; - src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 64); - }) - - // Initialize the input tile - LOOP_UNROLLING(int, i, 0, 1, 64, - { - in[i].v = 0; - }) - - // "in" here is 8x8 tile - T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); - - // A^T * in - LOOP_UNROLLING(int, i, 0, 1, 8, - { - tmp[0].v = in[8 + i].v + in[16 + i].v; - tmp[1].v = in[8 + i].v - in[16 + i].v; - tmp[2].v = in[24 + i].v + in[32 + i].v; - tmp[3].v = in[24 + i].v - in[32 + i].v; - tmp[3].v = tmp[3].v + tmp[3].v; - tmp[4].v = in[40 + i].v + in[48 + i].v; - tmp[4].v = tmp[4].v + tmp[4].v; - tmp[5].v = in[40 + i].v - in[48 + i].v; - - // 4x8 matrix as a result - in[i].v = in[i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v); - in[8 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v); - in[16 + i].v = tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[4].v); - in[24 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[5].v) + in[56 + i].v; - }) - - // Compute the output tile - TILE(DATA_TYPE, 16, N0, out); - - // in * A, with in = A^T * in as above - LOOP_UNROLLING(int, i, 0, 1, 4, - { - tmp[0].v = in[8 * i + 1].v + in[8 * i + 2].v; - tmp[1].v = in[8 * i + 1].v - in[8 * i + 2].v; - tmp[2].v = in[8 * i + 3].v + in[8 * i + 4].v; - tmp[3].v = in[8 * i + 3].v - in[8 * i + 4].v; - tmp[3].v = tmp[3].v + tmp[3].v; - tmp[4].v = in[8 * i + 5].v + in[8 * i + 6].v; - tmp[4].v = tmp[4].v + tmp[4].v; - tmp[5].v = in[8 * i + 5].v - in[8 * i + 6].v; - - // 4x4 tile - out[4 * i].v = in[8 * i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v); - out[4 * i + 1].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v); - out[4 * i + 2].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[0].v) + tmp[4].v; - out[4 * i + 3].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[1].v) + tmp[5].v + in[8 * i + 7].v; - }) - -#if defined(HAS_BIAS) - TILE(DATA_TYPE, 1, N0, b); - - T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); - - // c = c + bias[broadcasted] - T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out); -#endif // 
HAS_BIAS - - int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W; - int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H; - - T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); - - TILE(uint, 16, 1, dst_indirect_y); - - // Calculate the destination indirect Y - LOOP_UNROLLING(int, yk, 0, 1, 4, - { - LOOP_UNROLLING(int, xk, 0, 1, 4, - { - int x_c = min(x_out + xk, ((int)DST_WIDTH - 1)); - int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1)); - dst_indirect_y[xk + yk * 4].v = x_c + y_c * DST_WIDTH; - dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT); - }) - }) - - // Store the tile in reverse order so the invalid values are overwritten with the valid ones - T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); -#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) -} -#endif // defined(VEC_SIZE) && VEC_SIZE == 4 - -#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) -#if defined(VEC_SIZE) && VEC_SIZE == 2 -/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW - * - * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 - * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 - * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1 - * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_2x1_3x1_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst)
-#if defined(HAS_BIAS)
- ,
- VECTOR_DECLARATION(bias)
-#endif // defined(HAS_BIAS)
-)
-{
- winograd_output_transform_2x2_3x3_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes
-#if defined(HAS_BIAS)
- ,
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes
-#endif // defined(HAS_BIAS)
- );
-}
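-
- // The 1D wrapper kernels in this section all follow the pattern above: they forward every argument
- // unchanged to the corresponding 2D kernel and rely on the -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL
- // (or, for the vertical variants, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL) define to make that kernel
- // take its 1D code path.
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor.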
Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_2x1_7x1_nhwc(
-    TENSOR4D_DECLARATION(src),
-    TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
-    VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
-    int dst_size)
-{
-    winograd_output_transform_2x2_7x7_nhwc(src_ptr,
-                                           src_stride_x,
-                                           src_step_x,
-                                           src_stride_y,
-                                           src_step_y,
-                                           src_stride_z,
-                                           src_step_z,
-                                           src_stride_w,
-                                           src_step_w,
-                                           src_offset_first_element_in_bytes,
-                                           dst_ptr,
-                                           dst_stride_x,
-                                           dst_step_x,
-                                           dst_stride_y,
-                                           dst_step_y,
-                                           dst_stride_z,
-                                           dst_step_z,
-                                           dst_stride_w,
-                                           dst_step_w,
-                                           dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
-                                           bias_ptr,
-                                           bias_stride_x,
-                                           bias_step_x,
-                                           bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
-                                           dst_size);
-}
-#endif // defined(VEC_SIZE) && VEC_SIZE == 2
-
-#if defined(VEC_SIZE) && VEC_SIZE == 4
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_4x1_3x1_nchw(
-    TENSOR4D_DECLARATION(src),
-    TENSOR4D_DECLARATION(dst)
-#if defined(HAS_BIAS)
-    ,
-    VECTOR_DECLARATION(bias)
-#endif // defined(HAS_BIAS)
-)
-{
-    winograd_output_transform_4x4_3x3_nchw(src_ptr,
-                                           src_stride_x,
-                                           src_step_x,
-                                           src_stride_y,
-                                           src_step_y,
-                                           src_stride_z,
-                                           src_step_z,
-                                           src_stride_w,
-                                           src_step_w,
-                                           src_offset_first_element_in_bytes,
-                                           dst_ptr,
-                                           dst_stride_x,
-                                           dst_step_x,
-                                           dst_stride_y,
-                                           dst_step_y,
-                                           dst_stride_z,
-                                           dst_step_z,
-                                           dst_stride_w,
-                                           dst_step_w,
-                                           dst_offset_first_element_in_bytes
-#if defined(HAS_BIAS)
-                                           ,
-                                           bias_ptr,
-                                           bias_stride_x,
-                                           bias_step_x,
-                                           bias_offset_first_element_in_bytes
-#endif // defined(HAS_BIAS)
-                                          );
-}
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_4x1_5x1_nchw(
-    TENSOR4D_DECLARATION(src),
-    TENSOR4D_DECLARATION(dst)
-#if defined(HAS_BIAS)
-    ,
-    VECTOR_DECLARATION(bias)
-#endif // defined(HAS_BIAS)
-)
-{
-    winograd_output_transform_4x4_5x5_nchw(src_ptr,
-                                           src_stride_x,
-                                           src_step_x,
-                                           src_stride_y,
-                                           src_step_y,
-                                           src_stride_z,
-                                           src_step_z,
-                                           src_stride_w,
-                                           src_step_w,
-                                           src_offset_first_element_in_bytes,
-                                           dst_ptr,
-                                           dst_stride_x,
-                                           dst_step_x,
-                                           dst_stride_y,
-                                           dst_step_y,
-                                           dst_stride_z,
-                                           dst_step_z,
-                                           dst_stride_w,
-                                           dst_step_w,
-                                           dst_offset_first_element_in_bytes
-#if defined(HAS_BIAS)
-                                           ,
-                                           bias_ptr,
-                                           bias_stride_x,
-                                           bias_step_x,
-                                           bias_offset_first_element_in_bytes
-#endif // defined(HAS_BIAS)
-                                          );
-}
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_4x1_3x1_nhwc(
-    TENSOR4D_DECLARATION(src),
-    TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
-    VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
-    int dst_size)
-{
-    winograd_output_transform_4x4_3x3_nhwc(src_ptr,
-                                           src_stride_x,
-                                           src_step_x,
-                                           src_stride_y,
-                                           src_step_y,
-                                           src_stride_z,
-                                           src_step_z,
-                                           src_stride_w,
-                                           src_step_w,
-                                           src_offset_first_element_in_bytes,
-                                           dst_ptr,
-                                           dst_stride_x,
-                                           dst_step_x,
-                                           dst_stride_y,
-                                           dst_step_y,
-                                           dst_stride_z,
-                                           dst_step_z,
-                                           dst_stride_w,
-                                           dst_step_w,
-                                           dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
-                                           bias_ptr,
-                                           bias_stride_x,
-                                           bias_step_x,
-                                           bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
-                                           dst_size);
-}
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_4x1_5x1_nhwc(
-    TENSOR4D_DECLARATION(src),
-    TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
-    VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
-    int dst_size)
-{
-    winograd_output_transform_4x4_5x5_nhwc(src_ptr,
-                                           src_stride_x,
-                                           src_step_x,
-                                           src_stride_y,
-                                           src_step_y,
-                                           src_stride_z,
-                                           src_step_z,
-                                           src_stride_w,
-                                           src_step_w,
-                                           src_offset_first_element_in_bytes,
-                                           dst_ptr,
-                                           dst_stride_x,
-                                           dst_step_x,
-                                           dst_stride_y,
-                                           dst_step_y,
-                                           dst_stride_z,
-                                           dst_step_z,
-                                           dst_stride_w,
-                                           dst_step_w,
-                                           dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
-                                           bias_ptr,
-                                           bias_stride_x,
-                                           bias_step_x,
-                                           bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
-                                           dst_size);
-}
-#endif // defined(VEC_SIZE) && VEC_SIZE == 4
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-#if defined(VEC_SIZE) && VEC_SIZE == 2
-/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_1x2_1x3_nchw(
-    TENSOR4D_DECLARATION(src),
-    TENSOR4D_DECLARATION(dst)
-#if defined(HAS_BIAS)
-    ,
-    VECTOR_DECLARATION(bias)
-#endif // defined(HAS_BIAS)
-)
-{
-    winograd_output_transform_2x2_3x3_nchw(src_ptr,
-                                           src_stride_x,
-                                           src_step_x,
-                                           src_stride_y,
-                                           src_step_y,
-                                           src_stride_z,
-                                           src_step_z,
-                                           src_stride_w,
-                                           src_step_w,
-                                           src_offset_first_element_in_bytes,
-                                           dst_ptr,
-                                           dst_stride_x,
-                                           dst_step_x,
-                                           dst_stride_y,
-                                           dst_step_y,
-                                           dst_stride_z,
-                                           dst_step_z,
-                                           dst_stride_w,
-                                           dst_step_w,
-                                           dst_offset_first_element_in_bytes
-#if defined(HAS_BIAS)
-                                           ,
-                                           bias_ptr,
-                                           bias_stride_x,
-                                           bias_step_x,
-                                           bias_offset_first_element_in_bytes
-#endif // defined(HAS_BIAS)
-                                          );
-}
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_1x2_1x7_nhwc(
-    TENSOR4D_DECLARATION(src),
-    TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
-    VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
-    int dst_size)
-{
-    winograd_output_transform_2x2_7x7_nhwc(src_ptr,
-                                           src_stride_x,
-                                           src_step_x,
-                                           src_stride_y,
-                                           src_step_y,
-                                           src_stride_z,
-                                           src_step_z,
-                                           src_stride_w,
-                                           src_step_w,
-                                           src_offset_first_element_in_bytes,
-                                           dst_ptr,
-                                           dst_stride_x,
-                                           dst_step_x,
-                                           dst_stride_y,
-                                           dst_step_y,
-                                           dst_stride_z,
-                                           dst_step_z,
-                                           dst_stride_w,
-                                           dst_step_w,
-                                           dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
-                                           bias_ptr,
-                                           bias_stride_x,
-                                           bias_step_x,
-                                           bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
-                                           dst_size);
-}
-#endif // defined(VEC_SIZE) && VEC_SIZE == 2
-
-#if defined(VEC_SIZE) && VEC_SIZE == 4
-/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_1x4_1x3_nchw(
-    TENSOR4D_DECLARATION(src),
-    TENSOR4D_DECLARATION(dst)
-#if defined(HAS_BIAS)
-    ,
-    VECTOR_DECLARATION(bias)
-#endif // defined(HAS_BIAS)
-)
-{
-    winograd_output_transform_4x4_3x3_nchw(src_ptr,
-                                           src_stride_x,
-                                           src_step_x,
-                                           src_stride_y,
-                                           src_step_y,
-                                           src_stride_z,
-                                           src_step_z,
-                                           src_stride_w,
-                                           src_step_w,
-                                           src_offset_first_element_in_bytes,
-                                           dst_ptr,
-                                           dst_stride_x,
-                                           dst_step_x,
-                                           dst_stride_y,
-                                           dst_step_y,
-                                           dst_stride_z,
-                                           dst_step_z,
-                                           dst_stride_w,
-                                           dst_step_w,
-                                           dst_offset_first_element_in_bytes
-#if defined(HAS_BIAS)
-                                           ,
-                                           bias_ptr,
-                                           bias_stride_x,
-                                           bias_step_x,
-                                           bias_offset_first_element_in_bytes
-#endif // defined(HAS_BIAS)
-                                          );
-}
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NCHW
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_1x4_1x5_nchw(
-    TENSOR4D_DECLARATION(src),
-    TENSOR4D_DECLARATION(dst)
-#if defined(HAS_BIAS)
-    ,
-    VECTOR_DECLARATION(bias)
-#endif // defined(HAS_BIAS)
-)
-{
-    winograd_output_transform_4x4_5x5_nchw(src_ptr,
-                                           src_stride_x,
-                                           src_step_x,
-                                           src_stride_y,
-                                           src_step_y,
-                                           src_stride_z,
-                                           src_step_z,
-                                           src_stride_w,
-                                           src_step_w,
-                                           src_offset_first_element_in_bytes,
-                                           dst_ptr,
-                                           dst_stride_x,
-                                           dst_step_x,
-                                           dst_stride_y,
-                                           dst_step_y,
-                                           dst_stride_z,
-                                           dst_step_z,
-                                           dst_stride_w,
-                                           dst_step_w,
-                                           dst_offset_first_element_in_bytes
-#if defined(HAS_BIAS)
-                                           ,
-                                           bias_ptr,
-                                           bias_stride_x,
-                                           bias_step_x,
-                                           bias_offset_first_element_in_bytes
-#endif // defined(HAS_BIAS)
-                                          );
-}
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_1x4_1x3_nhwc(
-    TENSOR4D_DECLARATION(src),
-    TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
-    VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
-    int dst_size)
-{
-    winograd_output_transform_4x4_3x3_nhwc(src_ptr,
-                                           src_stride_x,
-                                           src_step_x,
-                                           src_stride_y,
-                                           src_step_y,
-                                           src_stride_z,
-                                           src_step_z,
-                                           src_stride_w,
-                                           src_step_w,
-                                           src_offset_first_element_in_bytes,
-                                           dst_ptr,
-                                           dst_stride_x,
-                                           dst_step_x,
-                                           dst_stride_y,
-                                           dst_step_y,
-                                           dst_stride_z,
-                                           dst_step_z,
-                                           dst_stride_w,
-                                           dst_step_w,
-                                           dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
-                                           bias_ptr,
-                                           bias_stride_x,
-                                           bias_step_x,
-                                           bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
-                                           dst_size);
-}
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
- * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
- * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  dst_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  dst_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_1x4_1x5_nhwc(
-    TENSOR4D_DECLARATION(src),
-    TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
-    VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
-    int dst_size)
-{
-    winograd_output_transform_4x4_5x5_nhwc(src_ptr,
-                                           src_stride_x,
-                                           src_step_x,
-                                           src_stride_y,
-                                           src_step_y,
-                                           src_stride_z,
-                                           src_step_z,
-                                           src_stride_w,
-                                           src_step_w,
-                                           src_offset_first_element_in_bytes,
-                                           dst_ptr,
-                                           dst_stride_x,
-                                           dst_step_x,
-                                           dst_stride_y,
-                                           dst_step_y,
-                                           dst_stride_z,
-                                           dst_step_z,
-                                           dst_stride_w,
-                                           dst_step_w,
-                                           dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
-                                           bias_ptr,
-                                           bias_stride_x,
-                                           bias_step_x,
-                                           bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
-                                           dst_size);
-}
-#endif // defined(VEC_SIZE) && VEC_SIZE == 4
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
-- 
cgit v1.2.1
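
Editor's note on the compile-time contract: the @note lines in the removed kernels document how every winograd_output_transform_* variant is specialised through -D preprocessor options (NUM_TILES_X, OUTPUT_TILE_W/H, DST_WIDTH/HEIGHT, VEC_SIZE, DATA_TYPE and the HORIZONTAL/VERTICAL selector) rather than through runtime arguments; inside Compute Library these defines are normally assembled by the kernel's configure() step via CLBuildOptions. The sketch below is not part of the patch: it is a minimal raw-OpenCL host-side illustration of building one such variant, assuming a cl_program and cl_device_id created elsewhere and purely illustrative tile/tensor sizes.

/* Minimal host-side sketch (illustrative, not part of this patch).
 * Builds the program with the -D options documented in the @note lines
 * for e.g. winograd_output_transform_4x1_3x1_nhwc. `program` and `device`
 * are assumed to exist; the numeric values are placeholders that must
 * match the actual tensor and tile configuration. */
#include <stdio.h>
#include <CL/cl.h>

static cl_int build_winograd_output_transform_4x1_3x1_nhwc(cl_program program,
                                                           cl_device_id device)
{
    char options[256];
    snprintf(options, sizeof(options),
             "-DDATA_TYPE=float "
             "-DNUM_TILES_X=%d "   /* tiles along the X direction        */
             "-DOUTPUT_TILE_W=%d " /* output tile width (4 for 4x1)      */
             "-DOUTPUT_TILE_H=%d " /* output tile height (1 for 4x1)     */
             "-DDST_WIDTH=%d "     /* output tensor width (NHWC only)    */
             "-DDST_HEIGHT=%d "    /* output tensor height (NHWC only)   */
             "-DVEC_SIZE=4 "       /* selects the 4x1/1x4 kernel block   */
             "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL",
             16, 4, 1, 24, 32);
    return clBuildProgram(program, 1, &device, options, NULL, NULL);
}

Because the tile geometry is baked in at build time, a separate program build is needed per configuration; user code normally never does this by hand, as each kernel's configure() adds the same defines through CLBuildOptions.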