224 files changed, 36560 insertions, 29 deletions
diff --git a/src/gpu/cl/ClCompileContext.h b/src/gpu/cl/ClCompileContext.h
new file mode 100644
index 0000000000..e69cc0200f
--- /dev/null
+++ b/src/gpu/cl/ClCompileContext.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_COMPILE_CONTEXT_H
+#define ARM_COMPUTE_CL_COMPILE_CONTEXT_H
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+using ClCompileContext = arm_compute::CLCompileContext;
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_COMPILE_CONTEXT_H */
diff --git a/src/gpu/cl/ClContext.cpp b/src/gpu/cl/ClContext.cpp
index d8ef18e62e..611c1cb501 100644
--- a/src/gpu/cl/ClContext.cpp
+++ b/src/gpu/cl/ClContext.cpp
@@ -23,11 +23,11 @@
  */
 #include "src/gpu/cl/ClContext.h"
 
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+
 #include "src/gpu/cl/ClQueue.h"
 #include "src/gpu/cl/ClTensor.h"
 
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-
 namespace arm_compute
 {
 namespace gpu
@@ -41,7 +41,7 @@ mlgo::MLGOHeuristics populate_mlgo(const char *filename)
     bool                 status = false;
     mlgo::MLGOHeuristics heuristics;
 
-    if(filename != nullptr)
+    if (filename != nullptr)
     {
         status = heuristics.reload_from_file(filename);
     }
@@ -50,12 +50,9 @@ mlgo::MLGOHeuristics populate_mlgo(const char *filename)
 } // namespace
 
 ClContext::ClContext(const AclContextOptions *options)
-    : IContext(Target::GpuOcl),
-      _mlgo_heuristics(),
-      _cl_ctx(),
-      _cl_dev()
+    : IContext(Target::GpuOcl), _mlgo_heuristics(), _cl_ctx(), _cl_dev()
 {
-    if(options != nullptr)
+    if (options != nullptr)
     {
         _mlgo_heuristics = populate_mlgo(options->kernel_config_file);
     }
@@ -80,7 +77,7 @@ const mlgo::MLGOHeuristics &ClContext::mlgo() const
 
 bool ClContext::set_cl_ctx(::cl::Context ctx)
 {
-    if(this->refcount() == 0)
+    if (this->refcount() == 0)
     {
         _cl_ctx = ctx;
         CLScheduler::get().set_context(ctx);
@@ -92,7 +89,7 @@ bool ClContext::set_cl_ctx(::cl::Context ctx)
 ITensorV2 *ClContext::create_tensor(const AclTensorDescriptor &desc, bool allocate)
 {
     ClTensor *tensor = new ClTensor(this, desc);
-    if(tensor != nullptr && allocate)
+    if (tensor != nullptr && allocate)
     {
         tensor->allocate();
     }
diff --git a/src/gpu/cl/ClContext.h b/src/gpu/cl/ClContext.h
index 2a0d4ee1c8..2c67ccf4d2 100644
--- a/src/gpu/cl/ClContext.h
+++ b/src/gpu/cl/ClContext.h
@@ -24,11 +24,11 @@
 #ifndef SRC_GPU_CLCONTEXT_H
 #define SRC_GPU_CLCONTEXT_H
 
+#include "arm_compute/core/CL/OpenCL.h"
+
 #include "src/common/IContext.h"
 #include "src/runtime/CL/mlgo/MLGOHeuristics.h"
 
-#include "arm_compute/core/CL/OpenCL.h"
-
 namespace arm_compute
 {
 namespace gpu
@@ -74,8 +74,12 @@ public:
     bool set_cl_ctx(::cl::Context ctx);
 
     // Inherrited methods overridden
-    ITensorV2 *create_tensor(const AclTensorDescriptor &desc, bool allocate) override;
-    IQueue *create_queue(const AclQueueOptions *options) override;
+    ITensorV2                          *create_tensor(const AclTensorDescriptor &desc, bool allocate) override;
+    IQueue                             *create_queue(const AclQueueOptions *options) override;
+    std::tuple<IOperator *, StatusCode> create_activation(const AclTensorDescriptor     &src,
+                                                          const AclTensorDescriptor     &dst,
+                                                          const AclActivationDescriptor &act,
+                                                          bool                           is_validate) override;
 
 private:
     mlgo::MLGOHeuristics _mlgo_heuristics;
@@ -86,4 +90,4 @@ private:
 } // namespace gpu
 } // namespace arm_compute
 
-#endif /* SRC_GPU_CLCONTEXT_H */
-\ No newline at end of file
+#endif /* SRC_GPU_CLCONTEXT_H */
diff --git a/src/gpu/cl/ClKernelLibrary.cpp b/src/gpu/cl/ClKernelLibrary.cpp
new file mode 100644
index 0000000000..c4117b8a1a
--- /dev/null
+++ b/src/gpu/cl/ClKernelLibrary.cpp
@@ -0,0 +1,1058 @@
+/*
+ * Copyright (c) 2016-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/ClKernelLibrary.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Utils.h"
+
+#include <algorithm>
+#include <array>
+#include <fstream>
+#include <utility>
+
+#ifdef ARM_COMPUTE_COMPRESSED_KERNELS
+#include <zlib.h>
+
+namespace
+{
+/* Decoding table */
+constexpr std::array<uint8_t, 256> b64_invtab = {
+    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0,  0,  0,  0,  62, 0,  0,  0,  63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
+    0,  0,  0,  0,  0,  0,  0,  0, 1, 2, 3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+    22, 23, 24, 25, 0,  0,  0,  0, 0, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+    45, 46, 47, 48, 49, 50, 51, 0, 0, 0, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+};
+
+/** Decode a base64 encoded string
+ *
+ * @param[in] str Base64 encoded string to decode
+ *
+ * @return The decode string in case of a valid, non-empty string otherwise an empty string
+ */
+std::string decode_base64(const std::string &str)
+{
+    constexpr const char pad_char = '=';
+
+    // Handle empty string
+    if (str.empty())
+    {
+        return {};
+    }
+
+    // Base64 encoded string has size multiple of 4
+    if (str.length() % 4)
+    {
+        return {};
+    }
+
+    //
+    // Check encoded string padding
+    std::size_t padding = (str.rbegin()[0] == pad_char) + (str.rbegin()[1] == pad_char);
+    const int   str_len = str.size();
+
+    // Reserve memory for the decoded string
+    // Note each 4 consecutive elements of 6-bit encode 3 bytes
+    std::string dec_b64;
+    dec_b64.reserve(((str_len / 4) * 3));
+
+    // Block decoding function (exclude padding)
+    int       c   = 0;
+    const int end = str_len - 4 - padding;
+    for (; c <= end; c += 4)
+    {
+        const int byte0 = b64_invtab[str[c]];
+        const int byte1 = b64_invtab[str[c + 1]];
+        const int byte2 = b64_invtab[str[c + 2]];
+        const int byte3 = b64_invtab[str[c + 3]];
+
+        dec_b64.push_back((byte0 << 2) | (byte1 >> 4));
+        dec_b64.push_back((byte1 << 4) | (byte2 >> 2));
+        dec_b64.push_back((byte2 << 6) | (byte3));
+    }
+
+    // Last step that might contain padding symbols
+    if (padding == 1)
+    {
+        const int byte0 = b64_invtab[str[c]];
+        const int byte1 = b64_invtab[str[c + 1]];
+        const int byte2 = b64_invtab[str[c + 2]];
+
+        dec_b64.push_back((byte0 << 2) | (byte1 >> 4));
+        dec_b64.push_back((byte1 << 4) | (byte2 >> 2));
+    }
+    else if (padding == 2)
+    {
+        const int byte0 = b64_invtab[str[c]];
+        const int byte1 = b64_invtab[str[c + 1]];
+
+        dec_b64.push_back((byte0 << 2) | (byte1 >> 4));
+    }
+
+    return dec_b64;
+}
+
+/** Decompress a zlib compressed string
+ *
+ * @param[in] str ZLib compressed string
+ *
+ * @return The decompressed string if successful, otherwise false.
+ */
+std::string decompress_zlib(const std::string &str)
+{
+    // Create and initialize decompression stream
+    z_stream ds{};
+    if (inflateInit(&ds) != Z_OK)
+    {
+        return std::string();
+    }
+    ds.avail_in = str.size();
+    ds.next_in  = (Bytef *)str.data();
+
+    // Roll-over the string using a buffer and decompress
+    int         status = Z_OK;
+    char        roll_buff[16384];
+    std::string inflated_str;
+    do
+    {
+        ds.avail_out = sizeof(roll_buff);
+        ds.next_out  = reinterpret_cast<Bytef *>(roll_buff);
+
+        status = inflate(&ds, 0);
+        if (inflated_str.size() < ds.total_out)
+        {
+            inflated_str.append(roll_buff, ds.total_out - inflated_str.size());
+        }
+    } while (status == Z_OK);
+
+    // Finalize decompression stream
+    inflateEnd(&ds);
+    if (status != Z_STREAM_END)
+    {
+        return std::string();
+    }
+
+    return inflated_str;
+}
+} // namespace
+#endif /* ARM_COMPUTE_COMPRESSED_KERNELS */
+
+namespace arm_compute
+{
+namespace opencl
+{
+const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map = {
+    // Common Kernels
+    {"activation_layer", "common/activation_layer.cl"},
+    {"activation_layer_quant", "common/activation_layer_quant.cl"},
+    {"activation_layer_quant_f32", "common/activation_layer_quant.cl"},
+    {"arg_min_max_x", "common/arg_min_max.cl"},
+    {"arg_min_max_y", "common/arg_min_max.cl"},
+    {"arg_min_max_z", "common/arg_min_max.cl"},
+    {"arg_min_max_w", "common/arg_min_max.cl"},
+    {"bitwise_or", "common/bitwise_op.cl"},
+    {"bitwise_and", "common/bitwise_op.cl"},
+    {"bitwise_xor", "common/bitwise_op.cl"},
+    {"bitwise_not", "common/bitwise_op.cl"},
+    {"bounding_box_transform", "common/bounding_box_transform.cl"},
+    {"bounding_box_transform_quantized", "common/bounding_box_transform_quantized.cl"},
+    {"compare_equal", "common/comparisons.cl"},
+    {"compare_equal_quantized", "common/comparisons.cl"},
+    {"compare_notequal", "common/comparisons.cl"},
+    {"compare_notequal_quantized", "common/comparisons.cl"},
+    {"compare_greater", "common/comparisons.cl"},
+    {"compare_greater_quantized", "common/comparisons.cl"},
+    {"compare_greaterequal", "common/comparisons.cl"},
+    {"compare_greaterequal_quantized", "common/comparisons.cl"},
+    {"compare_less", "common/comparisons.cl"},
+    {"compare_less_quantized", "common/comparisons.cl"},
+    {"compare_lessequal", "common/comparisons.cl"},
+    {"compare_lessequal_quantized", "common/comparisons.cl"},
+    {"concatenate", "common/concatenate.cl"},
+    {"concatenate_width", "common/concatenate.cl"},
+    {"concatenate_height", "common/concatenate.cl"},
+    {"concatenate_width_x2", "common/concatenate.cl"},
+    {"concatenate_width_x4", "common/concatenate.cl"},
+    {"col2im", "common/col2im.cl"},
+    {"cast_down", "common/cast.cl"},
+    {"cast_up", "common/cast.cl"},
+    {"convert_fc_weights", "common/convert_fc_weights.cl"},
+    {"copy_tensor", "common/copy_tensor.cl"},
+    {"crop_tensor", "common/crop_tensor.cl"},
+    {"deconvolution_reshape", "common/deconvolution_layer.cl"},
+    {"deconvolution_upsample", "common/deconvolution_layer.cl"},
+    {"dequantization_layer", "common/dequantization_layer.cl"},
+    {"elementwise_operation_ADD", "common/elementwise_operation.cl"},
+    {"elementwise_operation_SUB", "common/elementwise_operation.cl"},
+    {"elementwise_operation_MAX", "common/elementwise_operation.cl"},
+    {"elementwise_operation_MIN", "common/elementwise_operation.cl"},
+    {"elementwise_operation_DIV", "common/elementwise_operation.cl"},
+    {"elementwise_operation_SQUARED_DIFF", "common/elementwise_operation.cl"},
+    {"elementwise_operation_POWER", "common/elementwise_operation.cl"},
+    {"elementwise_operation_PRELU", "common/elementwise_operation.cl"},
+    {"elementwise_operation_AND", "common/elementwise_operation.cl"},
+    {"elementwise_operation_OR", "common/elementwise_operation.cl"},
+    {"elementwise_operation_ADD_quantized", "common/elementwise_operation_quantized.cl"},
+    {"elementwise_operation_SUB_quantized", "common/elementwise_operation_quantized.cl"},
+    {"elementwise_operation_MAX_quantized", "common/elementwise_operation_quantized.cl"},
+    {"elementwise_operation_MIN_quantized", "common/elementwise_operation_quantized.cl"},
+    {"elementwise_operation_DIV_quantized", "common/elementwise_operation_quantized.cl"},
+    {"elementwise_operation_SQUARED_DIFF_quantized", "common/elementwise_operation_quantized.cl"},
+    {"elementwise_operation_PRELU_quantized", "common/elementwise_operation_quantized.cl"},
+    {"elementwise_unary", "common/elementwise_unary.cl"},
+    {"elementwise_unary_quantized", "common/elementwise_unary_quantized.cl"},
+    {"fft_digit_reverse_axis_0", "common/fft_digit_reverse.cl"},
+    {"fft_digit_reverse_axis_1", "common/fft_digit_reverse.cl"},
+    {"fft_radix_2_first_stage_axis_0", "common/fft.cl"},
+    {"fft_radix_2_first_stage_axis_1", "common/fft.cl"},
+    {"fft_radix_2_axis_0", "common/fft.cl"},
+    {"fft_radix_2_axis_1", "common/fft.cl"},
+    {"fft_radix_3_first_stage_axis_0", "common/fft.cl"},
+    {"fft_radix_3_first_stage_axis_1", "common/fft.cl"},
+    {"fft_radix_3_axis_0", "common/fft.cl"},
+    {"fft_radix_3_axis_1", "common/fft.cl"},
+    {"fft_radix_4_first_stage_axis_0", "common/fft.cl"},
+    {"fft_radix_4_first_stage_axis_1", "common/fft.cl"},
+    {"fft_radix_4_axis_0", "common/fft.cl"},
+    {"fft_radix_4_axis_1", "common/fft.cl"},
+    {"fft_radix_5_first_stage_axis_0", "common/fft.cl"},
+    {"fft_radix_5_first_stage_axis_1", "common/fft.cl"},
+    {"fft_radix_5_axis_0", "common/fft.cl"},
+    {"fft_radix_5_axis_1", "common/fft.cl"},
+    {"fft_radix_7_first_stage_axis_0", "common/fft.cl"},
+    {"fft_radix_7_first_stage_axis_1", "common/fft.cl"},
+    {"fft_radix_7_axis_0", "common/fft.cl"},
+    {"fft_radix_7_axis_1", "common/fft.cl"},
+    {"fft_radix_8_first_stage_axis_0", "common/fft.cl"},
+    {"fft_radix_8_first_stage_axis_1", "common/fft.cl"},
+    {"fft_radix_8_axis_0", "common/fft.cl"},
+    {"fft_radix_8_axis_1", "common/fft.cl"},
+    {"fft_scale_conj", "common/fft_scale.cl"},
+    {"fill_image_borders_constant", "common/fill_border.cl"},
+    {"fill_image_borders_replicate", "common/fill_border.cl"},
+    {"floor_layer", "common/floor.cl"},
+    {"fuse_batchnormalization_layer", "common/batchnormalization_layer.cl"},
+    {"gather", "common/gather.cl"},
+    {"gemm_ma_f16", "common/gemm.cl"},
+    {"gemm_ma_f32", "common/gemm.cl"},
+    {"gemm_mv", "common/gemv.cl"},
+    {"gemm_mv_quantized", "common/gemv.cl"},
+    {"gemm_mm_native", "common/gemm.cl"},
+    {"gemm_mm_reshaped_only_rhs_nt_mmul", "common/gemm_reshaped_only_rhs_mmul.cl"},
+    {"gemm_mm_reshaped_only_rhs_nt_mmul_texture", "common/gemm_reshaped_only_rhs_mmul.cl"},
+    {"gemm_mm_reshaped_lhs_nt_rhs_t", "common/gemm.cl"},
+    {"gemm_mm_reshaped_lhs_nt_rhs_t_texture", "common/gemm.cl"},
+    {"gemm_mm_reshaped_lhs_t_rhs_nt", "common/gemm.cl"},
+    {"gemm_mm_reshaped_lhs_t_rhs_nt_texture", "common/gemm.cl"},
+    {"gemm_mm_reshaped_only_rhs_nt", "common/gemm.cl"},
+    {"gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm.cl"},
+    {"gemm_mm_reshaped_only_rhs_t", "common/gemm.cl"},
+    {"gemm_mm_reshaped_only_rhs_t_texture", "common/gemm.cl"},
+    {"gemm_lc_vm_f32", "common/gemm.cl"},
+    {"gemm_reshape_lhs_matrix_nt", "common/gemm_utils.cl"},
+    {"gemm_reshape_lhs_matrix_t", "common/gemm_utils.cl"},
+    {"gemm_reshape_rhs_matrix_nt", "common/gemm_utils.cl"},
+    {"gemm_reshape_rhs_matrix_t", "common/gemm_utils.cl"},
+    {"gemmlowp_matrix_a_reduction", "common/gemmlowp.cl"},
+    {"gemmlowp_matrix_a_reduction_dot8", "common/gemmlowp.cl"},
+    {"gemmlowp_matrix_b_reduction", "common/gemmlowp.cl"},
+    {"gemmlowp_mm_native", "common/gemmlowp.cl"},
+    {"gemmlowp_mm_reshaped_lhs_nt_rhs_t", "common/gemmlowp.cl"},
+    {"gemmlowp_mm_reshaped_only_rhs_t", "common/gemmlowp.cl"},
+    {"gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint", "common/gemmlowp.cl"},
+    {"gemmlowp_mm_reshaped_only_rhs_mmul", "common/gemmlowp_reshaped_only_rhs_mmul.cl"},
+    {"gemmlowp_offset_contribution", "common/gemmlowp.cl"},
+    {"gemmlowp_offset_contribution_quantize_down", "common/gemmlowp.cl"},
+    {"gemmlowp_offset_contribution_quantize_down_fixedpoint", "common/gemmlowp.cl"},
+    {"gemmlowp_output_stage_quantize_down", "common/gemmlowp.cl"},
+    {"gemmlowp_output_stage_quantize_down_fixedpoint", "common/gemmlowp.cl"},
+    {"gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16", "common/gemmlowp.cl"},
+    {"gemmlowp_output_stage_quantize_down_float", "common/gemmlowp.cl"},
+    {"generate_proposals_compute_all_anchors", "common/generate_proposals.cl"},
+    {"generate_proposals_compute_all_anchors_quantized", "common/generate_proposals_quantized.cl"},
+    {"instance_normalization", "common/instance_normalization.cl"},
+    {"compute_mean_var", "common/instance_normalization.cl"},
+    {"l2_normalize_x", "common/l2_normalize.cl"},
+    {"l2_normalize_y", "common/l2_normalize.cl"},
+    {"l2_normalize_z", "common/l2_normalize.cl"},
+    {"mat_mul_native_mmul_nt_nt", "common/mat_mul_mmul.cl"},
+    {"mat_mul_native_mmul_t_nt", "common/mat_mul_mmul.cl"},
+    {"mat_mul_native_mmul_nt_t", "common/mat_mul_mmul.cl"},
+    {"mat_mul_native_mmul_t_t", "common/mat_mul_mmul.cl"},
+    {"mat_mul_native_nt_nt", "common/mat_mul.cl"},
+    {"mat_mul_native_nt_t", "common/mat_mul.cl"},
+    {"mat_mul_native_t_nt", "common/mat_mul.cl"},
+    {"mat_mul_native_t_t", "common/mat_mul.cl"},
+    {"mat_mul_native_quantized_nt_nt", "common/mat_mul_quantized.cl"},
+    {"mat_mul_native_quantized_nt_t", "common/mat_mul_quantized.cl"},
+    {"mat_mul_native_quantized_t_nt", "common/mat_mul_quantized.cl"},
+    {"mat_mul_native_quantized_t_t", "common/mat_mul_quantized.cl"},
+    {"mat_mul_native_quantized_mmul_nt_nt", "common/mat_mul_quantized_mmul.cl"},
+    {"mat_mul_native_quantized_mmul_nt_t", "common/mat_mul_quantized_mmul.cl"},
+    {"mat_mul_native_quantized_mmul_t_nt", "common/mat_mul_quantized_mmul.cl"},
+    {"mat_mul_native_quantized_mmul_t_t", "common/mat_mul_quantized_mmul.cl"},
+    {"max_unpooling_layer_2", "common/unpooling_layer.cl"},
+    {"mean_stddev_normalization", "common/mean_stddev_normalization.cl"},
+    {"memset", "common/memset.cl"},
+    {"minmax_layer", "common/minmax_layer.cl"},
+    {"non_max_suppression", "common/nonmax.cl"},
+    {"pad_layer_constant", "common/pad_layer.cl"},
+    {"pad_layer_symmetric_reflect", "common/pad_layer.cl"},
+    {"permute", "common/permute.cl"},
+    {"pixelwise_mul_complex", "common/pixelwise_mul_float.cl"},
+    {"pixelwise_mul_float", "common/pixelwise_mul_float.cl"},
+    {"pixelwise_mul_int", "common/pixelwise_mul_int.cl"},
+    {"pixelwise_mul_quantized", "common/pixelwise_mul_int.cl"},
+    {"qlstm_layer_normalization", "common/qlstm_layer_normalization.cl"},
+    {"quantization_layer", "common/quantization_layer.cl"},
+    {"range", "common/range.cl"},
+    {"range_quantized", "common/range.cl"},
+    {"reduction_operation_x", "common/reduction_operation.cl"},
+    {"reduction_operation_non_parallel_x", "common/reduction_operation.cl"},
+    {"reduction_operation_y", "common/reduction_operation.cl"},
+    {"reduction_operation_z", "common/reduction_operation.cl"},
+    {"reduction_operation_w", "common/reduction_operation.cl"},
+    {"reshape_layer", "common/reshape_layer.cl"},
+    {"reshape_to_columns", "common/convolution_layer.cl"},
+    {"reverse", "common/reverse.cl"},
+    {"roi_align_layer", "common/roi_align_layer.cl"},
+    {"roi_align_layer_quantized", "common/roi_align_layer_quantized.cl"},
+    {"roi_pooling_layer", "common/roi_pooling_layer.cl"},
+    {"select_same_rank", "common/select.cl"},
+    {"select_different_rank_2", "common/select.cl"},
+    {"select_different_rank_n", "common/select.cl"},
+    {"softmax_x", "common/softmax_layer.cl"},
+    {"softmax_non_x", "common/softmax_layer.cl"},
+    {"stack_layer", "common/stack_layer.cl"},
+    {"strided_slice", "common/slice_ops.cl"},
+    {"tile", "common/tile.cl"},
+    {"transpose", "common/transpose.cl"},
+#ifdef ENABLE_NCHW_KERNELS
+    {"batch_to_space_nchw", "nchw/batch_to_space.cl"},
+    {"batch_to_space_static_nchw", "nchw/batch_to_space.cl"},
+    {"batchnormalization_layer_nchw", "nchw/batchnormalization_layer.cl"},
+    {"channel_shuffle_nchw", "nchw/channel_shuffle.cl"},
+    {"depth_to_space_nchw", "nchw/depth_to_space.cl"},
+    {"dequantization_layer_per_channel_nchw", "nchw/dequantization_layer.cl"},
+    {"direct_convolution1x1", "nchw/direct_convolution1x1.cl"},
+    {"direct_convolution_nchw", "nchw/direct_convolution.cl"},
+
+    {"im2col1x1_stridex1_nchw", "nchw/im2col.cl"},
+    {"im2col3x3_nchw", "nchw/im2col.cl"},
+    {"im2col5x5_nchw", "nchw/im2col.cl"},
+    {"im2col11x11_padx0_pady0_nchw", "nchw/im2col.cl"},
+    {"im2col_generic_nchw", "nchw/im2col.cl"},
+    {"im2col_generic_padx0_pady0_nchw", "nchw/im2col.cl"},
+    {"normalization_layer_cross_map_nchw", "nchw/normalization_layer.cl"},
+    {"normalization_layer_in_map_nchw", "nchw/normalization_layer.cl"},
+    {"normalize_planar_yuv_layer_nchw", "nchw/normalize_planar_yuv_layer.cl"},
+    {"normalize_planar_yuv_layer_q8_nchw", "nchw/normalize_planar_yuv_layer_quantized.cl"},
+    {"pooling_layer_MxN_nchw", "nchw/pooling_layer.cl"},
+    {"pooling_layer_2_nchw_indices", "nchw/pooling_layer.cl"},
+    {"prior_box_layer_nchw", "nchw/prior_box_layer.cl"},
+    {"reorg_layer_nchw", "nchw/reorg_layer.cl"},
+    {"scale_nearest_neighbour_nchw", "nchw/scale.cl"},
+    {"scale_bilinear_nchw", "nchw/scale.cl"},
+    {"space_to_batch_nchw", "nchw/space_to_batch.cl"},
+    {"space_to_batch_static_nchw", "nchw/space_to_batch.cl"},
+    {"space_to_depth_nchw", "nchw/space_to_depth.cl"},
+    {"upsample_layer_nchw", "nchw/upsample_layer.cl"},
+    {"winograd_filter_transform_2x2_3x3_nchw", "nchw/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_2x1_3x1_nchw", "nchw/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_1x2_1x3_nchw", "nchw/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_4x4_3x3_nchw", "nchw/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_4x1_3x1_nchw", "nchw/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_1x4_1x3_nchw", "nchw/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_4x4_5x5_nchw", "nchw/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_4x1_5x1_nchw", "nchw/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_1x4_1x5_nchw", "nchw/winograd_filter_transform.cl"},
+    {"winograd_input_transform_2x2_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl"},
+    {"winograd_input_transform_2x2_3x3_stepz2_nchw", "nchw/winograd_input_transform.cl"},
+    {"winograd_input_transform_2x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl"},
+    {"winograd_input_transform_2x1_3x1_stepz2_nchw", "nchw/winograd_input_transform.cl"},
+    {"winograd_input_transform_1x2_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl"},
+    {"winograd_input_transform_1x2_1x3_stepz2_nchw", "nchw/winograd_input_transform.cl"},
+    {"winograd_input_transform_4x4_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl"},
+    {"winograd_input_transform_4x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl"},
+    {"winograd_input_transform_1x4_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl"},
+    {"winograd_input_transform_4x4_5x5_stepz1_nchw", "nchw/winograd_input_transform.cl"},
+    {"winograd_input_transform_4x1_5x1_stepz1_nchw", "nchw/winograd_input_transform.cl"},
+    {"winograd_input_transform_1x4_1x5_stepz1_nchw", "nchw/winograd_input_transform.cl"},
+    {"winograd_output_transform_2x2_3x3_nchw", "nchw/winograd_output_transform.cl"},
+    {"winograd_output_transform_2x1_3x1_nchw", "nchw/winograd_output_transform.cl"},
+    {"winograd_output_transform_1x2_1x3_nchw", "nchw/winograd_output_transform.cl"},
+    {"winograd_output_transform_4x4_3x3_nchw", "nchw/winograd_output_transform.cl"},
+    {"winograd_output_transform_4x1_3x1_nchw", "nchw/winograd_output_transform.cl"},
+    {"winograd_output_transform_1x4_1x3_nchw", "nchw/winograd_output_transform.cl"},
+    {"winograd_output_transform_4x4_5x5_nchw", "nchw/winograd_output_transform.cl"},
+    {"winograd_output_transform_4x1_5x1_nchw", "nchw/winograd_output_transform.cl"},
+    {"winograd_output_transform_1x4_1x5_nchw", "nchw/winograd_output_transform.cl"},
+#endif /* ENABLE_NCHW_KERNELS */
+#ifdef ENABLE_NHWC_KERNELS
+    {"batch_to_space_nhwc", "nhwc/batch_to_space.cl"},
+    {"batch_to_space_static_nhwc", "nhwc/batch_to_space.cl"},
+    {"batchnormalization_layer_nhwc", "nhwc/batchnormalization_layer.cl"},
+    {"channel_shuffle_nhwc", "nhwc/channel_shuffle.cl"},
+    {"depth_to_space_nhwc", "nhwc/depth_to_space.cl"},
+    {"dequantization_layer_per_channel_nhwc", "nhwc/dequantization_layer.cl"},
+    {"dwc_native_fp_nhwc", "nhwc/dwc_native_fp_nhwc.cl"},
+    {"dwc_native_quantized_nhwc", "nhwc/dwc_native_quantized_nhwc.cl"},
+    {"direct_convolution_nhwc", "nhwc/direct_convolution.cl"},
+    {"direct_convolution3d_ndhwc", "nhwc/direct_convolution3d.cl"},
+    {"im2col3x3_nhwc", "nhwc/im2col.cl"},
+    {"im2col9x9_nhwc", "nhwc/im2col.cl"},
+    {"im2col_generic_nhwc", "nhwc/im2col.cl"},
+    {"indirect_convolution_nhwc", "nhwc/indirect_convolution.cl"},
+    {"indirect_convolution_address_precalculation", "nhwc/indirect_convolution.cl"},
+    {"normalization_layer_cross_map_nhwc", "nhwc/normalization_layer.cl"},
+    {"normalization_layer_in_map_nhwc", "nhwc/normalization_layer.cl"},
+    {"normalize_planar_yuv_layer_nhwc", "nhwc/normalize_planar_yuv_layer.cl"},
+    {"normalize_planar_yuv_layer_q8_nhwc", "nhwc/normalize_planar_yuv_layer_quantized.cl"},
+    {"pooling_layer_MxN_nhwc", "nhwc/pooling_layer.cl"},
+    {"pooling_layer_2x2_nhwc", "nhwc/pooling_layer.cl"},
+    {"pooling_layer_MxN_quantized_nhwc", "nhwc/pooling_layer_quantized.cl"},
+    {"pooling_3d_layer_MxN_ndhwc", "nhwc/pooling_3d_layer.cl"},
+    {"pooling_3d_layer_MxN_ndhwc_quantized", "nhwc/pooling_3d_layer_quantized.cl"},
+    {"reorg_layer_nhwc", "nhwc/reorg_layer.cl"},
+    {"scale_nearest_neighbour_nhwc", "nhwc/scale.cl"},
+    {"scale_bilinear_nhwc", "nhwc/scale.cl"},
+    {"scatter_mp1d_2d_mpnd", "common/scatter.cl"},
+    {"scatter1D", "common/scatter.cl"},
+    {"space_to_batch_nhwc", "nhwc/space_to_batch.cl"},
+    {"space_to_batch_static_nhwc", "nhwc/space_to_batch.cl"},
+    {"space_to_depth_nhwc", "nhwc/space_to_depth.cl"},
+    {"transposed_convolution_nhwc", "nhwc/transposed_convolution.cl"},
+    {"upsample_layer_nhwc", "nhwc/upsample_layer.cl"},
+    {"winograd_filter_transform_4x1_3x1_nhwc", "nhwc/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_1x4_1x3_nhwc", "nhwc/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_4x4_3x3_nhwc", "nhwc/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_4x4_5x5_nhwc", "nhwc/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_4x1_5x1_nhwc", "nhwc/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_1x4_1x5_nhwc", "nhwc/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_2x2_7x7_nhwc", "nhwc/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_2x1_7x1_nhwc", "nhwc/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_1x2_1x7_nhwc", "nhwc/winograd_filter_transform.cl"},
+    {"winograd_input_transform_4x1_3x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl"},
+    {"winograd_input_transform_1x4_1x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl"},
+    {"winograd_input_transform_4x4_3x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl"},
+    {"winograd_input_transform_4x4_5x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl"},
+    {"winograd_input_transform_4x1_5x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl"},
+    {"winograd_input_transform_1x4_1x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl"},
+    {"winograd_input_transform_2x2_7x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl"},
+    {"winograd_input_transform_2x1_7x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl"},
+    {"winograd_input_transform_1x2_1x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl"},
+    {"winograd_output_transform_4x1_3x1_nhwc", "nhwc/winograd_output_transform.cl"},
+    {"winograd_output_transform_1x4_1x3_nhwc", "nhwc/winograd_output_transform.cl"},
+    {"winograd_output_transform_4x4_3x3_nhwc", "nhwc/winograd_output_transform.cl"},
+    {"winograd_output_transform_4x4_5x5_nhwc", "nhwc/winograd_output_transform.cl"},
+    {"winograd_output_transform_4x1_5x1_nhwc", "nhwc/winograd_output_transform.cl"},
+    {"winograd_output_transform_1x4_1x5_nhwc", "nhwc/winograd_output_transform.cl"},
+    {"winograd_output_transform_2x2_7x7_nhwc", "nhwc/winograd_output_transform.cl"},
+    {"winograd_output_transform_2x1_7x1_nhwc", "nhwc/winograd_output_transform.cl"},
+    {"winograd_output_transform_1x2_1x7_nhwc", "nhwc/winograd_output_transform.cl"},
+#endif /* ENABLE_NHWC_KERNELS */
+};
+
+const std::map<std::string, std::string> ClKernelLibrary::_program_source_map = {
+#ifdef EMBEDDED_KERNELS
+    {
+        "activation_float_helpers.h",
+#include "./cl_kernels/activation_float_helpers.hembed"
+    },
+    {
+        "activation_quant_helpers.h",
+#include "./cl_kernels/activation_quant_helpers.hembed"
+    },
+    {
+        "common/activation_layer.cl",
+#include "./cl_kernels/common/activation_layer.clembed"
+    },
+    {
+        "common/activation_layer_quant.cl",
+#include "./cl_kernels/common/activation_layer_quant.clembed"
+    },
+    {
+        "common/arg_min_max.cl",
+#include "./cl_kernels/common/arg_min_max.clembed"
+    },
+    {
+        "common/bitwise_op.cl",
+#include "./cl_kernels/common/bitwise_op.clembed"
+    },
+    {
+        "common/bounding_box_transform.cl",
+#include "./cl_kernels/common/bounding_box_transform.clembed"
+    },
+    {
+        "common/bounding_box_transform_quantized.cl",
+#include "./cl_kernels/common/bounding_box_transform_quantized.clembed"
+    },
+    {
+        "common/col2im.cl",
+#include "./cl_kernels/common/col2im.clembed"
+    },
+    {
+        "common/comparisons.cl",
+#include "./cl_kernels/common/comparisons.clembed"
+    },
+    {
+        "common/concatenate.cl",
+#include "./cl_kernels/common/concatenate.clembed"
+    },
+    {
+        "common/convert_fc_weights.cl",
+#include "./cl_kernels/common/convert_fc_weights.clembed"
+    },
+    {
+        "common/convolution_layer.cl",
+#include "./cl_kernels/common/convolution_layer.clembed"
+    },
+    {
+        "common/copy_tensor.cl",
+#include "./cl_kernels/common/copy_tensor.clembed"
+    },
+    {
+        "common/crop_tensor.cl",
+#include "./cl_kernels/common/crop_tensor.clembed"
+    },
+    {
+        "common/deconvolution_layer.cl",
+#include "./cl_kernels/common/deconvolution_layer.clembed"
+    },
+    {
+        "common/cast.cl",
+#include "./cl_kernels/common/cast.clembed"
+    },
+    {
+        "common/dequantization_layer.cl",
+#include "./cl_kernels/common/dequantization_layer.clembed"
+    },
+    {
+        "common/elementwise_operation.cl",
+#include "./cl_kernels/common/elementwise_operation.clembed"
+    },
+    {
+        "common/elementwise_operation_quantized.cl",
+#include "./cl_kernels/common/elementwise_operation_quantized.clembed"
+    },
+    {
+        "common/elementwise_unary.cl",
+#include "./cl_kernels/common/elementwise_unary.clembed"
+    },
+    {
+        "common/elementwise_unary_quantized.cl",
+#include "./cl_kernels/common/elementwise_unary_quantized.clembed"
+    },
+    {
+        "common/fft.cl",
+#include "./cl_kernels/common/fft.clembed"
+    },
+    {
+        "common/fft_digit_reverse.cl",
+#include "./cl_kernels/common/fft_digit_reverse.clembed"
+    },
+    {
+        "common/fft_scale.cl",
+#include "./cl_kernels/common/fft_scale.clembed"
+    },
+    {
+        "common/fill_border.cl",
+#include "./cl_kernels/common/fill_border.clembed"
+    },
+    {
+        "common/floor.cl",
+#include "./cl_kernels/common/floor.clembed"
+    },
+    {
+        "common/gather.cl",
+#include "./cl_kernels/common/gather.clembed"
+    },
+    {
+        "common/scatter.cl",
+#include "./cl_kernels/common/scatter.clembed"
+    },
+    {
+        "common/gemm.cl",
+#include "./cl_kernels/common/gemm.clembed"
+    },
+    {
+        "common/gemm_reshaped_only_rhs_mmul.cl",
+#include "./cl_kernels/common/gemm_reshaped_only_rhs_mmul.clembed"
+    },
+    {
+        "common/gemm_utils.cl",
+#include "./cl_kernels/common/gemm_utils.clembed"
+    },
+    {
+        "common/gemmlowp.cl",
+#include "./cl_kernels/common/gemmlowp.clembed"
+    },
+    {
+        "common/gemmlowp_reshaped_only_rhs_mmul.cl",
+#include "./cl_kernels/common/gemmlowp_reshaped_only_rhs_mmul.clembed"
+    },
+    {
+        "common/gemv.cl",
+#include "./cl_kernels/common/gemv.clembed"
+    },
+    {
+        "common/generate_proposals.cl",
+#include "./cl_kernels/common/generate_proposals.clembed"
+    },
+    {
+        "common/generate_proposals_quantized.cl",
+#include "./cl_kernels/common/generate_proposals_quantized.clembed"
+    },
+    {
+        "gemm_helpers.h",
+#include "./cl_kernels/gemm_helpers.hembed"
+    },
+    {
+        "helpers.h",
+#include "./cl_kernels/helpers.hembed"
+    },
+    {
+        "helpers_asymm.h",
+#include "./cl_kernels/helpers_asymm.hembed"
+    },
+    {
+        "repeat.h",
+#include "./cl_kernels/repeat.hembed"
+    },
+    {
+        "tile_helpers.h",
+#include "./cl_kernels/tile_helpers.hembed"
+    },
+    {
+        "common/instance_normalization.cl",
+#include "./cl_kernels/common/instance_normalization.clembed"
+    },
+    {
+        "common/l2_normalize.cl",
+#include "./cl_kernels/common/l2_normalize.clembed"
+    },
+    {
+        "common/mean_stddev_normalization.cl",
+#include "./cl_kernels/common/mean_stddev_normalization.clembed"
+    },
+    {
+        "common/memset.cl",
+#include "./cl_kernels/common/memset.clembed"
+    },
+    {
+        "common/minmax_layer.cl",
+#include "./cl_kernels/common/minmax_layer.clembed"
+    },
+    {
+        "common/nonmax.cl",
+#include "./cl_kernels/common/nonmax.clembed"
+    },
+    {
+        "common/batchnormalization_layer.cl",
+#include "./cl_kernels/common/batchnormalization_layer.clembed"
+    },
+    {
+        "common/pad_layer.cl",
+#include "./cl_kernels/common/pad_layer.clembed"
+    },
+    {
+        "common/permute.cl",
+#include "./cl_kernels/common/permute.clembed"
+    },
+    {
+        "common/pixelwise_mul_float.cl",
+#include "./cl_kernels/common/pixelwise_mul_float.clembed"
+    },
+    {
+        "common/pixelwise_mul_int.cl",
+#include "./cl_kernels/common/pixelwise_mul_int.clembed"
+    },
+    {
+        "common/qlstm_layer_normalization.cl",
+#include "./cl_kernels/common/qlstm_layer_normalization.clembed"
+    },
+    {
+        "common/quantization_layer.cl",
+#include "./cl_kernels/common/quantization_layer.clembed"
+    },
+    {
+        "common/range.cl",
+#include "./cl_kernels/common/range.clembed"
+    },
+    {
+        "common/reduction_operation.cl",
+#include "./cl_kernels/common/reduction_operation.clembed"
+    },
+    {
+        "common/reshape_layer.cl",
+#include "./cl_kernels/common/reshape_layer.clembed"
+    },
+    {
+        "common/reverse.cl",
+#include "./cl_kernels/common/reverse.clembed"
+    },
+    {
+        "common/roi_align_layer.cl",
+#include "./cl_kernels/common/roi_align_layer.clembed"
+    },
+    {
+        "common/roi_align_layer_quantized.cl",
+#include "./cl_kernels/common/roi_align_layer_quantized.clembed"
+    },
+    {
+        "common/roi_pooling_layer.cl",
+#include "./cl_kernels/common/roi_pooling_layer.clembed"
+    },
+    {
+        "common/select.cl",
+#include "./cl_kernels/common/select.clembed"
+    },
+    {
+        "common/softmax_layer.cl",
+#include "./cl_kernels/common/softmax_layer.clembed"
+    },
+    {
+        "common/slice_ops.cl",
+#include "./cl_kernels/common/slice_ops.clembed"
+    },
+    {
+        "common/stack_layer.cl",
+#include "./cl_kernels/common/stack_layer.clembed"
+    },
+    {
+        "common/tile.cl",
+#include "./cl_kernels/common/tile.clembed"
+    },
+    {
+        "common/transpose.cl",
+#include "./cl_kernels/common/transpose.clembed"
+    },
+    {
+        "types.h",
+#include "./cl_kernels/types.hembed"
+    },
+    {
+        "common/unpooling_layer.cl",
+#include "./cl_kernels/common/unpooling_layer.clembed"
+    },
+    {
+        "common/mat_mul.cl",
+#include "./cl_kernels/common/mat_mul.clembed"
+    },
+    {
+        "common/mat_mul_mmul.cl",
+#include "./cl_kernels/common/mat_mul_mmul.clembed"
+    },
+    {
+        "common/mat_mul_quantized.cl",
+#include "./cl_kernels/common/mat_mul_quantized.clembed"
+    },
+    {
+        "common/mat_mul_quantized_mmul.cl",
+#include "./cl_kernels/common/mat_mul_quantized_mmul.clembed"
+    },
+#ifdef ENABLE_NCHW_KERNELS
+    {
+        "nchw/batch_to_space.cl",
+#include "./cl_kernels/nchw/batch_to_space.clembed"
+    },
+    {
+        "nchw/channel_shuffle.cl",
+#include "./cl_kernels/nchw/channel_shuffle.clembed"
+    },
+    {
+        "nchw/upsample_layer.cl",
+#include "./cl_kernels/nchw/upsample_layer.clembed"
+    },
+    {
+        "nchw/depth_to_space.cl",
+#include "./cl_kernels/nchw/depth_to_space.clembed"
+    },
+    {
+        "nchw/dequantization_layer.cl",
+#include "./cl_kernels/nchw/dequantization_layer.clembed"
+    },
+    {
+        "nchw/direct_convolution.cl",
+#include "./cl_kernels/nchw/direct_convolution.clembed"
+    },
+    {
+        "nchw/im2col.cl",
+#include "./cl_kernels/nchw/im2col.clembed"
+    },
+    {
+        "nchw/normalization_layer.cl",
+#include "./cl_kernels/nchw/normalization_layer.clembed"
+    },
+    {
+        "nchw/normalize_planar_yuv_layer.cl",
+#include "./cl_kernels/nchw/normalize_planar_yuv_layer.clembed"
+    },
+    {
+        "nchw/normalize_planar_yuv_layer_quantized.cl",
+#include "./cl_kernels/nchw/normalize_planar_yuv_layer_quantized.clembed"
+    },
+    {
+        "nchw/batchnormalization_layer.cl",
+#include "./cl_kernels/nchw/batchnormalization_layer.clembed"
+    },
+    {
+        "nchw/pooling_layer.cl",
+#include "./cl_kernels/nchw/pooling_layer.clembed"
+    },
+    {
+        "nchw/prior_box_layer.cl",
+#include "./cl_kernels/nchw/prior_box_layer.clembed"
+    },
+    {
+        "nchw/reorg_layer.cl",
+#include "./cl_kernels/nchw/reorg_layer.clembed"
+    },
+    {
+        "nchw/scale.cl",
+#include "./cl_kernels/nchw/scale.clembed"
+    },
+    {
+        "nchw/space_to_batch.cl",
+#include "./cl_kernels/nchw/space_to_batch.clembed"
+    },
+    {
+        "nchw/space_to_depth.cl",
+#include "./cl_kernels/nchw/space_to_depth.clembed"
+    },
+    {
+        "nchw/winograd_filter_transform.cl",
+#include "./cl_kernels/nchw/winograd_filter_transform.clembed"
+    },
+    {
+        "nchw/winograd_input_transform.cl",
+#include "./cl_kernels/nchw/winograd_input_transform.clembed"
+    },
+    {
+        "nchw/winograd_output_transform.cl",
+#include "./cl_kernels/nchw/winograd_output_transform.clembed"
+    },
+#endif /* ENABLE_NCHW_KERNELS */
+
+#ifdef ENABLE_NHWC_KERNELS
+    {
+        "nhwc/batch_to_space.cl",
+#include "./cl_kernels/nhwc/batch_to_space.clembed"
+    },
+    {
+        "nhwc/channel_shuffle.cl",
+#include "./cl_kernels/nhwc/channel_shuffle.clembed"
+    },
+    {
+        "nhwc/upsample_layer.cl",
+#include "./cl_kernels/nhwc/upsample_layer.clembed"
+    },
+    {
+        "nhwc/depth_to_space.cl",
+#include "./cl_kernels/nhwc/depth_to_space.clembed"
+    },
+    {
+        "nhwc/dequantization_layer.cl",
+#include "./cl_kernels/nhwc/dequantization_layer.clembed"
+    },
+    {
+        "nhwc/direct_convolution.cl",
+#include "./cl_kernels/nhwc/direct_convolution.clembed"
+    },
+    {
+        "nhwc/direct_convolution3d.cl",
+#include "./cl_kernels/nhwc/direct_convolution3d.clembed"
+    },
+    {
+        "nhwc/dwc_native_fp_nhwc.cl",
+#include "./cl_kernels/nhwc/dwc_native_fp_nhwc.clembed"
+    },
+    {
+        "nhwc/dwc_native_quantized_nhwc.cl",
+#include "./cl_kernels/nhwc/dwc_native_quantized_nhwc.clembed"
+    },
+    {
+        "nhwc/normalization_layer.cl",
+#include "./cl_kernels/nhwc/normalization_layer.clembed"
+    },
+    {
+        "nhwc/normalize_planar_yuv_layer.cl",
+#include "./cl_kernels/nhwc/normalize_planar_yuv_layer.clembed"
+    },
+    {
+        "nhwc/normalize_planar_yuv_layer_quantized.cl",
+#include "./cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.clembed"
+    },
+    {
+        "nhwc/im2col.cl",
+#include "./cl_kernels/nhwc/im2col.clembed"
+    },
+    {
+        "nhwc/indirect_convolution.cl",
+#include "./cl_kernels/nhwc/indirect_convolution.clembed"
+    },
+    {
+        "nhwc/batchnormalization_layer.cl",
+#include "./cl_kernels/nhwc/batchnormalization_layer.clembed"
+    },
+    {
+        "nhwc/pooling_layer.cl",
+#include "./cl_kernels/nhwc/pooling_layer.clembed"
+    },
+    {
+        "nhwc/pooling_3d_layer.cl",
+#include "./cl_kernels/nhwc/pooling_3d_layer.clembed"
+    },
+    {
+        "nhwc/pooling_3d_layer_quantized.cl",
+#include "./cl_kernels/nhwc/pooling_3d_layer_quantized.clembed"
+    },
+    {
+        "nhwc/pooling_layer_quantized.cl",
+#include "./cl_kernels/nhwc/pooling_layer_quantized.clembed"
+    },
+    {
+        "nhwc/reorg_layer.cl",
+#include "./cl_kernels/nhwc/reorg_layer.clembed"
+    },
+    {
+        "nhwc/scale.cl",
+#include "./cl_kernels/nhwc/scale.clembed"
+    },
+    {
+        "nhwc/space_to_batch.cl",
+#include "./cl_kernels/nhwc/space_to_batch.clembed"
+    },
+    {
+        "nhwc/space_to_depth.cl",
+#include "./cl_kernels/nhwc/space_to_depth.clembed"
+    },
+    {
+        "nhwc/transposed_convolution.cl",
+#include "./cl_kernels/nhwc/transposed_convolution.clembed"
+    },
+    {
+        "nhwc/winograd_filter_transform.cl",
+#include "./cl_kernels/nhwc/winograd_filter_transform.clembed"
+    },
+    {
+        "nhwc/winograd_input_transform.cl",
+#include "./cl_kernels/nhwc/winograd_input_transform.clembed"
+    },
+    {
+        "nhwc/winograd_output_transform.cl",
+#include "./cl_kernels/nhwc/winograd_output_transform.clembed"
+    },
+#endif /* ENABLE_NHWC_KERNELS */
+#endif /* EMBEDDED_KERNELS */
+};
+
+ClKernelLibrary &ClKernelLibrary::get()
+{
+    static ClKernelLibrary _kernel_library;
+    return _kernel_library;
+}
+
+std::string ClKernelLibrary::program_name(const std::string &kernel_name) const
+{
+    // Find which program contains the kernel
+    auto kernel_program_it = _kernel_program_map.find(kernel_name);
+
+    if (_kernel_program_map.end() == kernel_program_it)
+    {
+        ARM_COMPUTE_ERROR_VAR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str());
+    }
+
+    const std::string program_name = kernel_program_it->second;
+
+    return program_name;
+}
+
+void ClKernelLibrary::set_kernel_path(std::string kernel_path)
+{
+    _kernel_path = std::move(kernel_path);
+    _kernel_path += "/";
+}
+
+const std::string &ClKernelLibrary::kernel_path() const
+{
+    return _kernel_path;
+}
+
+ClKernelLibrary::ClProgramInfo ClKernelLibrary::program(const std::string &program_name) const
+{
+#ifdef EMBEDDED_KERNELS
+#ifdef ARM_COMPUTE_COMPRESSED_KERNELS
+    const auto inflatted_program_source_it = _decompressed_source_map.find(program_name);
+    if (inflatted_program_source_it != _decompressed_source_map.end())
+    {
+        return ClProgramInfo{inflatted_program_source_it->second, false};
+    }
+#endif /* ARM_COMPUTE_COMPRESSED_KERNELS */
+
+    const auto program_source_it = _program_source_map.find(program_name);
+    if (program_source_it == _program_source_map.end())
+    {
+        ARM_COMPUTE_ERROR_VAR("Embedded program for %s does not exist.", program_name.c_str());
+    }
+    std::string program_source = program_source_it->second;
+
+#ifdef ARM_COMPUTE_COMPRESSED_KERNELS
+    std::string decompressed_program_source = decompress_zlib(decode_base64(program_source_it->second));
+    ARM_COMPUTE_ERROR_ON_MSG(decompressed_program_source.empty(), "Cannot de-compress requested program");
+    _decompressed_source_map.insert(std::make_pair(program_name, decompressed_program_source));
+    program_source = std::move(decompressed_program_source);
+#endif /* ARM_COMPUTE_COMPRESSED_KERNELS */
+
+    return ClProgramInfo{program_source, false};
+#else  /* EMBEDDED_KERNELS */
+    // Check for binary
+    std::string source_name = _kernel_path + program_name;
+    std::string binary_name = source_name + "bin";
+    std::string program_source{};
+    bool        is_binary = false;
+
+    if (std::ifstream(binary_name).is_open())
+    {
+        program_source = read_file(binary_name, true);
+        is_binary      = true;
+    }
+    else if (std::ifstream(source_name).is_open())
+    {
+        program_source = read_file(source_name, false);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR_VAR("Kernel file %s does not exist.", source_name.c_str());
+    }
+
+    return ClProgramInfo{program_source, is_binary};
+#endif /* EMBEDDED_KERNELS */
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/ClKernelLibrary.h b/src/gpu/cl/ClKernelLibrary.h
new file mode 100644
index 0000000000..cd1d689199
--- /dev/null
+++ b/src/gpu/cl/ClKernelLibrary.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_KERNEL_LIBRARY_H
+#define ARM_COMPUTE_CL_KERNEL_LIBRARY_H
+
+#include <map>
+#include <string>
+#include <tuple>
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** ClKernelLibrary contains all the OpenCL kernels that are used throughout the library
+ *
+ * @note Kernel library is a singleton to reduce memory requirements
+ * @note Sole responsibility is just to provide access to the kernel string,
+ *       does not perform any compilation and relevant tasks
+ */
+class ClKernelLibrary final
+{
+private:
+    /** Default Constructor */
+    ClKernelLibrary() = default;
+    /** Prevent instances of this class from being copied */
+    ClKernelLibrary(const ClKernelLibrary &) = delete;
+    /** Prevent instances of this class from being copied */
+    const ClKernelLibrary &operator=(const ClKernelLibrary &) = delete;
+
+public:
+    /** Structure to encapsulte program related information */
+    struct ClProgramInfo
+    {
+        std::string program{};        /**< Program raw string */
+        bool        is_binary{false}; /**< Flag that indicates if is in binary format */
+    };
+
+public:
+    /** Access the KernelLibrary singleton
+     *
+     * @return The KernelLibrary instance
+     */
+    static ClKernelLibrary &get();
+    /** Sets the path that the kernels reside in
+     *
+     * @param[in] kernel_path Path of the kernel
+     */
+    void set_kernel_path(std::string kernel_path);
+    /** Gets the path that the kernels reside in
+     */
+    const std::string &kernel_path() const;
+    /** Gets the source of the selected program
+     *
+     * @param[in] program_name Program name
+     *
+     * @return A pair with the source (false) or the binary (true), of the selected program
+     */
+    ClProgramInfo program(const std::string &program_name) const;
+    /** Returns the program name given a kernel name
+     *
+     * @return Program name
+     */
+    std::string program_name(const std::string &kernel_name) const;
+
+private:
+    std::string _kernel_path{}; /**< Path to the kernels folder. */
+    mutable std::map<std::string, std::string>
+        _decompressed_source_map{}; /**< Map holding the decompressed files when compression is used */
+    static const std::map<std::string, std::string>
+        _kernel_program_map; /**< Map that associates kernel names with programs. */
+    static const std::map<std::string, std::string> _program_source_map; /**< Contains sources for all programs.
+                                                                                     Used for compile-time kernel inclusion. >*/
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_KERNEL_LIBRARY_H */
diff --git a/src/gpu/cl/ClQueue.cpp b/src/gpu/cl/ClQueue.cpp
index 2123adcf39..0cb7af5b61 100644
--- a/src/gpu/cl/ClQueue.cpp
+++ b/src/gpu/cl/ClQueue.cpp
@@ -36,7 +36,7 @@ namespace
 {
 CLTunerMode map_tuner_mode(AclTuningMode mode)
 {
-    switch(mode)
+    switch (mode)
     {
         case AclRapid:
             return CLTunerMode::RAPID;
@@ -55,7 +55,7 @@ CLTunerMode map_tuner_mode(AclTuningMode mode)
 
 std::unique_ptr<CLTuner> populate_tuner(const AclQueueOptions *options)
 {
-    if(options == nullptr || options->mode == AclTuningModeNone)
+    if (options == nullptr || options->mode == AclTuningModeNone)
     {
         return nullptr;
     }
@@ -68,8 +68,7 @@ std::unique_ptr<CLTuner> populate_tuner(const AclQueueOptions *options)
 }
 } // namespace
 
-ClQueue::ClQueue(IContext *ctx, const AclQueueOptions *options)
-    : IQueue(ctx), _tuner(nullptr)
+ClQueue::ClQueue(IContext *ctx, const AclQueueOptions *options) : IQueue(ctx), _tuner(nullptr)
 {
     _tuner = populate_tuner(options);
 }
diff --git a/src/gpu/cl/ClQueue.h b/src/gpu/cl/ClQueue.h
index b16a0f4e83..09ffb06cf3 100644
--- a/src/gpu/cl/ClQueue.h
+++ b/src/gpu/cl/ClQueue.h
@@ -24,10 +24,10 @@
 #ifndef SRC_GPU_CLQUEUE_H
 #define SRC_GPU_CLQUEUE_H
 
-#include "src/common/IQueue.h"
-
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
+#include "src/common/IQueue.h"
+
 #include <memory>
 
 namespace arm_compute
diff --git a/src/gpu/cl/ClTensor.cpp b/src/gpu/cl/ClTensor.cpp
index 0df07813e3..27422a4130 100644
--- a/src/gpu/cl/ClTensor.cpp
+++ b/src/gpu/cl/ClTensor.cpp
@@ -31,8 +31,7 @@ namespace gpu
 {
 namespace opencl
 {
-ClTensor::ClTensor(IContext *ctx, const AclTensorDescriptor &desc)
-    : ITensorV2(ctx), _legacy_tensor()
+ClTensor::ClTensor(IContext *ctx, const AclTensorDescriptor &desc) : ITensorV2(ctx), _legacy_tensor()
 {
     ARM_COMPUTE_ASSERT((ctx != nullptr) && (ctx->type() == Target::GpuOcl));
     _legacy_tensor = std::make_unique<CLTensor>();
@@ -43,7 +42,7 @@ void *ClTensor::map()
 {
     ARM_COMPUTE_ASSERT(_legacy_tensor.get() != nullptr);
 
-    if(_legacy_tensor == nullptr)
+    if (_legacy_tensor == nullptr)
     {
         ARM_COMPUTE_LOG_ERROR_ACL("[ClTensor:map]: Backing tensor does not exist!");
         return nullptr;
@@ -57,7 +56,7 @@ StatusCode ClTensor::unmap()
 {
     ARM_COMPUTE_ASSERT(_legacy_tensor.get() != nullptr);
 
-    if(_legacy_tensor == nullptr)
+    if (_legacy_tensor == nullptr)
     {
         ARM_COMPUTE_LOG_ERROR_ACL("[ClTensor:unmap]: Backing tensor does not exist!");
         return StatusCode::RuntimeError;
diff --git a/src/gpu/cl/ClTensor.h b/src/gpu/cl/ClTensor.h
index 99d228c0b8..70184cd4bd 100644
--- a/src/gpu/cl/ClTensor.h
+++ b/src/gpu/cl/ClTensor.h
@@ -24,10 +24,10 @@
 #ifndef SRC_GPU_CLTENSOR_H
 #define SRC_GPU_CLTENSOR_H
 
-#include "src/common/ITensorV2.h"
-
 #include "arm_compute/runtime/CL/CLTensor.h"
 
+#include "src/common/ITensorV2.h"
+
 namespace arm_compute
 {
 namespace gpu
@@ -54,7 +54,7 @@ public:
     void                 *map() override;
     StatusCode            unmap() override;
     arm_compute::ITensor *tensor() const override;
-    StatusCode import(void *handle, ImportMemoryType type) override;
+    StatusCode            import(void *handle, ImportMemoryType type) override;
 
 private:
     std::unique_ptr<CLTensor> _legacy_tensor;
@@ -63,4 +63,4 @@ private:
 } // namespace gpu
 } // namespace arm_compute
 
-#endif /* SRC_GPU_CLTENSOR_H */
-\ No newline at end of file
+#endif /* SRC_GPU_CLTENSOR_H */
diff --git a/src/gpu/cl/IClKernel.h b/src/gpu/cl/IClKernel.h
new file mode 100644
index 0000000000..4f07e9ad68
--- /dev/null
+++ b/src/gpu/cl/IClKernel.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_ICL_KERNEL_H
+#define ARM_COMPUTE_ICL_KERNEL_H
+
+#include "arm_compute/core/ITensorInfo.h"
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+using IClKernel = arm_compute::ICLKernel;
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_ICL_KERNEL_H */
diff --git a/src/gpu/cl/IClOperator.h b/src/gpu/cl/IClOperator.h
new file mode 100644
index 0000000000..049bf05dc1
--- /dev/null
+++ b/src/gpu/cl/IClOperator.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_ICL_OPERATOR_H
+#define ARM_COMPUTE_ICL_OPERATOR_H
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/runtime/CL/ICLOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+using IClOperator = experimental::ICLOperator;
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_ICL_OPERATOR_H */
diff --git a/src/gpu/cl/kernels/ClActivationKernel.cpp b/src/gpu/cl/kernels/ClActivationKernel.cpp
new file mode 100644
index 0000000000..a85296f7cd
--- /dev/null
+++ b/src/gpu/cl/kernels/ClActivationKernel.cpp
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClActivationKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+#include <set>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::QSYMM16, DataType::F16, DataType::F32);
+
+    static std::set<ActivationLayerInfo::ActivationFunction> quantized_supported_activations = {
+        ActivationLayerInfo::ActivationFunction::RELU,         ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+        ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, ActivationLayerInfo::ActivationFunction::LOGISTIC,
+        ActivationLayerInfo::ActivationFunction::TANH,         ActivationLayerInfo::ActivationFunction::HARD_SWISH,
+        ActivationLayerInfo::ActivationFunction::LEAKY_RELU,
+    };
+    const DataType          data_type = src->data_type();
+    const QuantizationInfo &oq_info   = (dst != nullptr) ? dst->quantization_info() : src->quantization_info();
+    const ActivationLayerInfo::ActivationFunction f_act = act_info.activation();
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(data_type) &&
+                                        (quantized_supported_activations.count(f_act) == 0),
+                                    "For Quantized data type only hard swish, leaky relu, tanh, logistic, relu and "
+                                    "lower/upper bounded relu are supported");
+
+    ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 &&
+                                (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
+                                (oq_info != QuantizationInfo(1.f / 128.f, 128)));
+    ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 &&
+                                (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
+                                (oq_info != QuantizationInfo(1.f / 256.f, 0)));
+
+    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) &&
+                                (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
+                                (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
+    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) &&
+                                (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
+                                (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
+
+    ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED &&
+                                (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
+                                (oq_info != QuantizationInfo(1.f / 128.f, 0)));
+    ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED &&
+                                (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
+                                (oq_info != QuantizationInfo(1.f / 256.f, -128)));
+
+    // Checks performed when destination is configured
+    if ((dst != nullptr) && (dst->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+    }
+
+    return Status{};
+}
+} // namespace
+
+ClActivationKernel::ClActivationKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClActivationKernel::configure(const ClCompileContext &compile_context,
+                                   ITensorInfo            *src,
+                                   ITensorInfo            *dst,
+                                   ActivationLayerInfo     act_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+
+    auto padding_info = get_padding_info({src, dst});
+
+    _run_in_place = (dst == nullptr) || (dst == src);
+
+    if (dst != nullptr)
+    {
+        // Destination auto inizialitation if not yet initialized
+        auto_init_if_empty(*dst, *src->clone());
+    }
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, (dst != nullptr) ? dst : nullptr, act_info));
+
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0));
+
+    const DataType dt      = src->data_type();
+    float          a_const = act_info.a();
+    float          b_const = act_info.b();
+
+    const ActivationLayerInfo::ActivationFunction f_act        = act_info.activation();
+    const bool                                    is_quantized = is_data_type_quantized(dt);
+    const bool perform_activation_in_float = (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) ||
+                                             (f_act == ActivationLayerInfo::ActivationFunction::TANH) ||
+                                             (f_act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) ||
+                                             (f_act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU);
+
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option_if(perform_activation_in_float, "-DFLOAT_DOMAIN");
+    build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
+    build_opts.add_option("-DACT=" + lower_string(string_from_activation_func(f_act)));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+
+    std::string kernel_name = std::string("activation_layer");
+
+    // Set quantization info build options
+    if (is_quantized)
+    {
+        const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
+
+        if (!perform_activation_in_float)
+        {
+            int a_const_int = 0;
+            int b_const_int = 0;
+
+            // Create quantized version of constants a, b if needed
+            switch (dt)
+            {
+                case DataType::QASYMM8:
+                {
+                    a_const_int = quantize_qasymm8(a_const, iq_info);
+                    b_const_int = quantize_qasymm8(b_const, iq_info);
+                }
+                break;
+                case DataType::QASYMM8_SIGNED:
+                {
+                    a_const_int = quantize_qasymm8_signed(a_const, iq_info);
+                    b_const_int = quantize_qasymm8_signed(b_const, iq_info);
+                }
+                break;
+                case DataType::QSYMM16:
+                {
+                    a_const_int = quantize_qsymm16(a_const, iq_info);
+                    b_const_int = quantize_qsymm16(b_const, iq_info);
+                }
+                break;
+                default:
+                    break;
+            }
+            build_opts.add_option(("-DA_VAL=" + support::cpp11::to_string(a_const_int)));
+            build_opts.add_option(("-DB_VAL=" + support::cpp11::to_string(b_const_int)));
+        }
+        else
+        {
+            build_opts.add_option(("-DA_VAL=" + float_to_string_with_full_precision(a_const)));
+            build_opts.add_option(("-DB_VAL=" + float_to_string_with_full_precision(b_const)));
+        }
+
+        // Quantized value of 0 corresponds to the offset o1
+        build_opts.add_option(
+            ("-DCONST_0=" + (is_data_type_quantized_asymmetric(dt) ? support::cpp11::to_string(iq_info.offset) : "0")));
+        build_opts.add_option(("-DS1_VAL=" + float_to_string_with_full_precision(iq_info.scale)));
+        build_opts.add_option_if(is_data_type_quantized_asymmetric(dt),
+                                 "-DO1_VAL=" + support::cpp11::to_string(iq_info.offset));
+
+        // Set correct kernel name
+        kernel_name += perform_activation_in_float ? std::string("_quant_f32") : std::string("_quant");
+
+        // Set scale and offset of the source and destination if they have different quantization info
+        if (dst != nullptr)
+        {
+            const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
+
+            if (iq_info != oq_info)
+            {
+                build_opts.add_option(("-DS2_VAL=" + float_to_string_with_full_precision(oq_info.scale)));
+                build_opts.add_option_if(is_data_type_quantized_asymmetric(dt),
+                                         "-DO2_VAL=" + support::cpp11::to_string(oq_info.offset));
+            }
+        }
+    }
+    else
+    {
+        // Set A, B constants in build options for float types
+        build_opts.add_option(("-DA_VAL=" + float_to_string_with_full_precision(a_const)));
+        build_opts.add_option(("-DB_VAL=" + float_to_string_with_full_precision(b_const)));
+    }
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration));
+    ICLKernel::configure_internal(win);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "activation_layer_";
+    _config_id += lower_string(string_from_data_type(dt));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src->dimension(1));
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, act_info));
+    return Status{};
+}
+
+void ClActivationKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    ARM_COMPUTE_ERROR_ON(_run_in_place && src != dst);
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src, slice);
+        if (!_run_in_place)
+        {
+            add_3D_tensor_argument(idx, dst, slice);
+        }
+        enqueue(queue, *this, slice, lws_hint());
+    } while (collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClActivationKernel.h b/src/gpu/cl/kernels/ClActivationKernel.h
new file mode 100644
index 0000000000..ab7607bb82
--- /dev/null
+++ b/src/gpu/cl/kernels/ClActivationKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_ACTIVATION_KERNEL_H
+#define ARM_COMPUTE_CL_ACTIVATION_KERNEL_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the activation kernel. */
+class ClActivationKernel : public IClKernel
+{
+public:
+    ClActivationKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClActivationKernel);
+    /** Configure kernel for a given list of arguments
+     *
+     * @note If the output tensor is a nullptr, the activation function will be performed in-place
+     *
+     * @param[in]      compile_context The compile context to be used.
+     * @param[in, out] src             Source tensor info. In case of @p dst tensor = nullptr, this tensor will store the result
+     *                                 of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
+     * @param[out]     dst             Destination tensor info. Data type supported: same as @p src
+     * @param[in]      act_info        Activation layer information.
+     */
+    void configure(const ClCompileContext &compile_context,
+                   ITensorInfo            *src,
+                   ITensorInfo            *dst,
+                   ActivationLayerInfo     act_info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClActivationKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+
+private:
+    bool _run_in_place{false};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_ACTIVATION_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp b/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp
new file mode 100644
index 0000000000..a853f6bc1b
--- /dev/null
+++ b/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2019-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClBatchConcatenateKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX));
+    ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) != dst->dimension(Window::DimY));
+    ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimZ) != dst->dimension(Window::DimZ));
+    ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(3) + batch_offset > dst->dimension(3));
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(4, src, dst);
+
+    return Status{};
+}
+} // namespace
+
+ClBatchConcatenateKernel::ClBatchConcatenateKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClBatchConcatenateKernel::configure(const CLCompileContext &compile_context,
+                                         ITensorInfo            *src,
+                                         unsigned int            batch_offset,
+                                         ITensorInfo            *dst)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, batch_offset, dst));
+
+    auto padding_info = get_padding_info({src, dst});
+
+    _batch_offset = batch_offset;
+
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0));
+
+    // Add build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+    if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
+    {
+        const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
+
+        build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
+        build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
+        build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale));
+        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
+    }
+
+    std::string kernel_name = "concatenate";
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel window
+    auto win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration));
+    win.set(3, Window::Dimension(0, src->tensor_shape()[3], 1));
+    ICLKernel::configure_internal(win);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "concatenate_";
+    _config_id += support::cpp11::to_string(3);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(batch_offset);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src->dimension(3));
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClBatchConcatenateKernel::validate(const arm_compute::ITensorInfo *src,
+                                          unsigned int                    batch_offset,
+                                          const arm_compute::ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, batch_offset, dst));
+    return Status{};
+}
+
+void ClBatchConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    Window slice = window.first_slice_window_3D();
+
+    const int offset_to_first_elements_in_bytes = _batch_offset * dst->info()->strides_in_bytes()[3];
+
+    unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters
+    _kernel.setArg<cl_int>(idx, offset_to_first_elements_in_bytes);
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src, slice);
+        add_3D_tensor_argument(idx, dst, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    } while (window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClBatchConcatenateKernel.h b/src/gpu/cl/kernels/ClBatchConcatenateKernel.h
new file mode 100644
index 0000000000..549576b628
--- /dev/null
+++ b/src/gpu/cl/kernels/ClBatchConcatenateKernel.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_BATCH_CONCATENATE_KERNEL_H
+#define ARM_COMPUTE_CL_BATCH_CONCATENATE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the batch concatenate kernel.
+ *  The src tensor will be concatenated into the destination tensor.
+ */
+class ClBatchConcatenateKernel : public IClKernel
+{
+public:
+    ClBatchConcatenateKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClBatchConcatenateKernel);
+    /** Initialise the kernel's source and destination
+     *
+     * @param[in]     compile_context The compile context to be used.
+     * @param[in]     src             Source tensor info. Data types supported: All.
+     * @param[in]     batch_offset    The offset on axis # 3.
+     * @param[in,out] dst             Destination tensor info. Data types supported: Same as @p src.
+     *
+     * @note: The dst tensor's low two dimensions can't be smaller than the src one's.
+     * @note: The gaps between the two lowest dimensions of src and dst need to be divisible by 2.
+     *
+     */
+    void
+    configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClBatchConcatenateKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+
+private:
+    unsigned int _batch_offset{0};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_BATCH_CONCATENATE_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClCastKernel.cpp b/src/gpu/cl/kernels/ClCastKernel.cpp
new file mode 100644
index 0000000000..bbffcf55a3
--- /dev/null
+++ b/src/gpu/cl/kernels/ClCastKernel.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2016-2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClCastKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
+{
+    ARM_COMPUTE_UNUSED(policy);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON(src == dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+        src, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED,
+        DataType::QSYMM8_PER_CHANNEL, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16,
+        DataType::F32, DataType::S64, DataType::U64);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+                                                         DataType::S16, DataType::U16, DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == dst->data_type(), "src and dst data types must be different");
+
+    // Validate in case of configured dst
+    if (dst->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+    }
+
+    return Status{};
+}
+} // namespace
+
+ClCastKernel::ClCastKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClCastKernel::configure(const CLCompileContext &compile_context,
+                             const ITensorInfo      *src,
+                             ITensorInfo            *dst,
+                             ConvertPolicy           policy)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    // Auto initialize dst shape if not initialized (We can only auto-configure the shape, datatype must be given)
+    set_shape_if_empty(*dst, src->tensor_shape());
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, policy));
+
+    auto padding_info = get_padding_info({src, dst});
+
+    // Get data sizes
+    const size_t src_size = data_size_from_type(src->data_type());
+    const size_t dst_size = data_size_from_type(dst->data_type());
+
+    // Get number of elements to process per iterations
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0));
+
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type()));
+    build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type()));
+    // Conversions from float always SATURATE as out-of-bounds conversion from float->integer is implementation defined
+    build_opts.add_option_if(is_data_type_float(src->data_type()) || policy == ConvertPolicy::SATURATE, "-DSATURATE");
+    build_opts.add_option_if(is_data_type_float(src->data_type()) || is_data_type_float(dst->data_type()),
+                             "-DIS_DATA_TYPE_FLOAT");
+    build_opts.add_option_if(is_data_type_quantized(src->data_type()), "-DIS_DATA_TYPE_QUANTIZED");
+
+    // Create kernel
+    const std::string kernel_name = (src_size >= dst_size) ? "cast_down" : "cast_up";
+    _kernel                       = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel
+    Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration));
+    ICLKernel::configure_internal(win);
+
+    // Collapse window
+    const Window &full_window      = window();
+    Window        collapsed_window = full_window.collapse_if_possible(full_window, Window::DimZ);
+    ICLKernel::configure_internal(collapsed_window);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_type(src->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(1));
+}
+
+Status ClCastKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, policy));
+    return Status{};
+}
+
+void ClCastKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src, slice);
+        add_3D_tensor_argument(idx, dst, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    } while (collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClCastKernel.h b/src/gpu/cl/kernels/ClCastKernel.h
new file mode 100644
index 0000000000..07b0b61443
--- /dev/null
+++ b/src/gpu/cl/kernels/ClCastKernel.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_CAST_KERNEL_H
+#define ARM_COMPUTE_CL_CAST_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Casts a given tensor to a new type
+ *
+ * @note When casting between quantized types the scale and zeroPoint are ignored
+ */
+class ClCastKernel : public IClKernel
+{
+public:
+    ClCastKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClCastKernel);
+    /** Set the src and dst of the kernel.
+     *
+     * Valid conversions src -> dst :
+     *
+     *   - QSYMM8_PER_CHANNEL -> QASYMM8 (ATTENTION: it is the user's responsibility to keep track of the quantization info in the TensorInfo meta-data)
+     *   - U8  -> S8, U16, S16, U32, S32, F16, F32
+     *   - S8  -> U8, U16, S16, U32, S32, F16, F32
+     *   - U16 -> U8, S8, S16, U32, S32, F16, F32
+     *   - S16 -> U8, S8, U16, U32, S32, F16, F32
+     *   - U32 -> U8, S8, U16, S16, S32, F16, F32
+     *   - S64 -> U8, S8, U16, S16, U32, S32, F16, F32
+     *   - U64 -> U8, S8, U16, S16, U32, S32, F16, F32
+     *   - S32 -> U8, S8, U16, S16, U32, F16, F32
+     *   - F16 -> U8, S8, U16, S16, U32, S32, F32
+     *   - F32 -> U8, S8, U16, S16, U32, S32, F16
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             The source tensor to convert. Data types supported: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/U64/S64/F16/F32.
+     * @param[out] dst             The destination tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
+     * @param[in]  policy          Conversion policy
+     */
+    void
+    configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClCastKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_CAST_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClCol2ImKernel.cpp b/src/gpu/cl/kernels/ClCol2ImKernel.cpp
new file mode 100644
index 0000000000..9972e07f05
--- /dev/null
+++ b/src/gpu/cl/kernels/ClCol2ImKernel.cpp
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClCol2ImKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+#include <cmath>
+
+namespace arm_compute
+{
+using namespace misc::shape_calculator;
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src,
+                          const ITensorInfo *dst,
+                          const Size2D      &convolved_dims,
+                          unsigned int       num_groups)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
+
+    // Checks performed when output is configured
+    if (dst->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+            dst->tensor_shape(), compute_col2im_shape(*src, convolved_dims, true, num_groups));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_layout() != DataLayout::NCHW,
+                                        "Col2Im output's data layout must always be NCHW");
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    // Output auto inizialitation if not yet initialized
+    auto_init_if_empty(*dst, src->clone()
+                                 ->set_tensor_shape(compute_col2im_shape(*src, convolved_dims, true, num_groups))
+                                 .set_data_layout(DataLayout::NCHW));
+
+    constexpr unsigned int num_elems_read_per_iteration = 8;
+
+    // Configure window
+    Window win = calculate_max_window(*src, Steps(num_elems_read_per_iteration));
+
+    // Update window and padding just for the input tensor as we cannot access out-of-bounds elements in the output one
+    AccessWindowHorizontal input_access(src, 0, num_elems_read_per_iteration);
+    bool                   window_changed = update_window_and_padding(win, input_access);
+
+    Status err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+ClCol2ImKernel::ClCol2ImKernel() : _convolved_dims()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClCol2ImKernel::configure(const CLCompileContext &compile_context,
+                               ITensorInfo            *src,
+                               ITensorInfo            *dst,
+                               const Size2D           &convolved_dims,
+                               unsigned int            num_groups)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, convolved_dims, num_groups));
+
+    _convolved_dims = convolved_dims;
+
+    const DataType data_type = src->data_type();
+
+    // Create kernel
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src->element_size()));
+    build_opts.add_option("-DWIDTH_INPUT=" + support::cpp11::to_string(src->dimension(0)));
+    build_opts.add_option("-DWIDTH_OUTPUT=" + support::cpp11::to_string(_convolved_dims.width));
+    build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
+
+    _kernel = create_kernel(compile_context, "col2im", build_opts.options());
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(src, dst, _convolved_dims, num_groups);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    IClKernel::configure_internal(win_config.second);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "col2im_";
+    _config_id += lower_string(string_from_data_type(src->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(num_groups);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(1));
+}
+
+Status ClCol2ImKernel::validate(const ITensorInfo *src,
+                                const ITensorInfo *dst,
+                                const Size2D      &convolved_dims,
+                                unsigned int       num_groups)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, convolved_dims, num_groups));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window(src->clone().get(), dst->clone().get(), convolved_dims, num_groups).first);
+    return Status{};
+}
+
+void ClCol2ImKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IClKernel::window(), window);
+
+    bool is_collapsed     = false;
+    bool is_collapsed_out = false;
+
+    auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    Window out_window;
+    out_window.use_tensor_dimensions(dst->info()->tensor_shape());
+
+    Window collapsed     = window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &is_collapsed);
+    Window collapsed_out = out_window.collapse_if_possible(out_window, 3, &is_collapsed_out);
+
+    ARM_COMPUTE_ERROR_ON(is_collapsed != is_collapsed_out);
+
+    Window slice     = collapsed.first_slice_window_3D();
+    Window slice_out = collapsed_out.first_slice_window_4D();
+    do
+    {
+        // Set inputs
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src, slice);
+        add_4D_tensor_argument(idx, dst, slice_out);
+        enqueue(queue, *this, slice, lws_hint());
+    } while (collapsed.slide_window_slice_3D(slice) && collapsed_out.slide_window_slice_4D(slice_out));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClCol2ImKernel.h b/src/gpu/cl/kernels/ClCol2ImKernel.h
new file mode 100644
index 0000000000..34194aba01
--- /dev/null
+++ b/src/gpu/cl/kernels/ClCol2ImKernel.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_COL2IM_KERNEL_H
+#define ARM_COMPUTE_CL_COL2IM_KERNEL_H
+
+#include "arm_compute/core/Size2D.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the col2im reshaping kernel.
+ *
+ * Rearranges each matrix column into image blocks. It's the inverse operation of @ref opencl::kernels::ClIm2ColKernel.
+ *
+ * For example, a vector of 9 elements can be reshaped to a block(image) of 3x3:
+ *
+ * @f[
+ * \left( \begin{array}{ccccccccc}
+ * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccc}
+ * a0 & a1 & a2 \\
+ * a3 & a4 & a5 \\
+ * a6 & a7 & a8 \\
+ * \end{array} \right)
+ * @f]
+ */
+class ClCol2ImKernel : public IClKernel
+{
+public:
+    /** Default constructor */
+    ClCol2ImKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClCol2ImKernel);
+    /** Set the input and output of the kernel.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             The input tensor info to convert. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+     * @param[out] dst             The output tensor info. 3 lower dimensions represent a single output [width, height, OFM],
+     *                             while the rest represent batch of outputs. Data types supported: Same as @p input. Data layout: NCHW
+     * @param[in]  convolved_dims  Output convolved dimensions.
+     * @param[in]  num_groups      (Optional) Number of groups when performing a grouped convolution
+     */
+    void configure(const CLCompileContext &compile_context,
+                   ITensorInfo            *src,
+                   ITensorInfo            *dst,
+                   const Size2D           &convolved_dims,
+                   unsigned int            num_groups = 1);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClCol2ImKernel::configure()
+     *
+     * @return a status
+     */
+    static Status
+    validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups = 1);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+public:
+    Size2D _convolved_dims;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CL_COL2IM_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp
new file mode 100644
index 0000000000..85d3c3939c
--- /dev/null
+++ b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+ClConvertFullyConnectedWeightsKernel::ClConvertFullyConnectedWeightsKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &compile_context,
+                                                     const ITensorInfo      *src,
+                                                     ITensorInfo            *dst,
+                                                     const TensorShape      &original_src_shape,
+                                                     DataLayout              data_layout)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    // Output tensor auto initialisation if not yet initialized
+    auto_init_if_empty(*dst, *src->clone());
+
+    auto padding_info = get_padding_info({src, dst});
+
+    ARM_COMPUTE_ERROR_THROW_ON(
+        ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout));
+
+    const DataLayout src_data_layout = (data_layout == DataLayout::NCHW) ? DataLayout::NHWC : DataLayout::NCHW;
+
+    const int width_idx   = get_data_layout_dimension_index(src_data_layout, DataLayoutDimension::WIDTH);
+    const int height_idx  = get_data_layout_dimension_index(src_data_layout, DataLayoutDimension::HEIGHT);
+    const int channel_idx = get_data_layout_dimension_index(src_data_layout, DataLayoutDimension::CHANNEL);
+
+    const unsigned int num_elems_per_src_plane = original_src_shape[width_idx] * original_src_shape[height_idx];
+    const unsigned int num_channels            = original_src_shape[channel_idx];
+
+    const unsigned int factor_1 = (data_layout == DataLayout::NCHW) ? num_elems_per_src_plane : num_channels;
+    const unsigned int factor_2 = (data_layout == DataLayout::NCHW) ? num_channels : num_elems_per_src_plane;
+
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()));
+    build_opts.add_option("-DFACTOR_1=" + support::cpp11::to_string(factor_1));
+    build_opts.add_option("-DFACTOR_2=" + support::cpp11::to_string(factor_2));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, "convert_fc_weights", build_opts.options());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*src, Steps());
+    ICLKernel::configure_internal(win);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src,
+                                                      const ITensorInfo *dst,
+                                                      const TensorShape &original_src_shape,
+                                                      DataLayout         data_layout)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+    ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(1) != original_src_shape.total_size_lower(3));
+    ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN);
+
+    // Checks performed when dst is configured
+    if (dst->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+    }
+
+    return Status{};
+}
+
+void ClConvertFullyConnectedWeightsKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    unsigned int idx = 0;
+    add_2D_tensor_argument(idx, src, window);
+    add_2D_tensor_argument(idx, dst, window);
+    enqueue(queue, *this, window, lws_hint());
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h
new file mode 100644
index 0000000000..0ddb54561a
--- /dev/null
+++ b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H
+#define ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+/** Interface to convert the 2D Fully Connected weights from NCHW to NHWC or vice versa.
+ *
+ * @note This function can be applied to the 2D weights used by a Fully Connected layer if:
+ *       - It follows a Convolution layer
+ *       - The data layout used by the network does not match the one the model has been trained in.
+ *
+ * @note This function assumes the weights are already reshaped (transposed)
+ */
+namespace opencl
+{
+namespace kernels
+{
+class ClConvertFullyConnectedWeightsKernel : public IClKernel
+{
+public:
+    ClConvertFullyConnectedWeightsKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClConvertFullyConnectedWeightsKernel);
+    /** Set the src and dst tensor.
+     *
+     * @param[in]  compile_context    The compile context to be used.
+     * @param[in]  src                Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All.
+     * @param[out] dst                The converted weights tensor info. Shape and Data Type: Same as @p src.
+     * @param[in]  original_src_shape Shape of the original src tensor (the one entering fully connected layer).
+     * @param[in]  data_layout        The data layout the weights have been trained in.
+     */
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo      *src,
+                   ITensorInfo            *dst,
+                   const TensorShape      &original_src_shape,
+                   DataLayout              data_layout);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClConvertFullyConnectedWeightsKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src,
+                           const ITensorInfo *dst,
+                           const TensorShape &original_src_shape,
+                           DataLayout         data_layout);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClCopyKernel.cpp b/src/gpu/cl/kernels/ClCopyKernel.cpp
new file mode 100644
index 0000000000..c80ef664f5
--- /dev/null
+++ b/src/gpu/cl/kernels/ClCopyKernel.cpp
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClCopyKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Window *dst_window = nullptr)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+
+    // Validate dst if initialized
+    if (dst->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+        if (dst_window == nullptr)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst->tensor_shape());
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst_window->shape());
+        }
+    }
+
+    return Status{};
+}
+
+} // namespace
+
+ClCopyKernel::ClCopyKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClCopyKernel::configure(const CLCompileContext &compile_context,
+                             const ITensorInfo      *src,
+                             ITensorInfo            *dst,
+                             Window                 *dst_window)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, dst_window));
+
+    auto padding_info = get_padding_info({src, dst});
+
+    // Create kernel
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+
+    // Output auto inizialitation if not yet initialized
+    auto_init_if_empty(*dst, *src);
+
+    // Configure window
+    const unsigned int vec_size_x = adjust_vec_size(16 / src->element_size(), src->dimension(0));
+
+    const Window win_config = calculate_max_window(*src, Steps(vec_size_x));
+
+    if (dst_window != nullptr)
+    {
+        _has_dst_window                = true;
+        _dst_window                    = Window(*dst_window);
+        const int  width_x             = dst_window->num_iterations(0);
+        const int  vec_size_x_leftover = width_x % vec_size_x;
+        const bool multi_access_x      = width_x >= static_cast<int32_t>(vec_size_x);
+
+        if (multi_access_x)
+        {
+            _dst_window.set(Window::DimX,
+                            Window::Dimension(dst_window->x().start(),
+                                              ceil_to_multiple(dst_window->x().end(), vec_size_x), vec_size_x));
+        }
+
+        build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftover));
+    }
+    else
+    {
+        const int width_x             = src->tensor_shape().x();
+        const int vec_size_x_leftover = width_x % vec_size_x;
+
+        build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftover));
+    }
+
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+
+    // Build kernel
+    _kernel = create_kernel(compile_context, "copy_tensor", build_opts.options());
+
+    // Validate and set the window
+    ICLKernel::configure_internal(win_config);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status
+ClCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, Window *dst_window)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, dst_window));
+
+    return Status{};
+}
+
+void ClCopyKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    Window slice;
+
+    if (_has_dst_window)
+    {
+        slice            = window.first_slice_window_3D();
+        Window out_slice = _dst_window.first_slice_window_3D();
+        do
+        {
+            unsigned int idx = 0;
+            add_3D_tensor_argument(idx, src, slice);
+            add_3D_tensor_argument(idx, dst, out_slice);
+            enqueue(queue, *this, slice, lws_hint());
+        } while (window.slide_window_slice_3D(slice) && _dst_window.slide_window_slice_3D(out_slice));
+    }
+    else
+    {
+        Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+        slice            = collapsed.first_slice_window_3D();
+        do
+        {
+            unsigned int idx = 0;
+            add_3D_tensor_argument(idx, src, slice);
+            add_3D_tensor_argument(idx, dst, slice);
+            enqueue(queue, *this, slice, lws_hint());
+        } while (collapsed.slide_window_slice_3D(slice));
+    }
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClCopyKernel.h b/src/gpu/cl/kernels/ClCopyKernel.h
new file mode 100644
index 0000000000..f915bf672d
--- /dev/null
+++ b/src/gpu/cl/kernels/ClCopyKernel.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_COPY_KERNEL_H
+#define ARM_COMPUTE_CL_COPY_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to perform a copy between two tensors */
+class ClCopyKernel : public IClKernel
+{
+public:
+    ClCopyKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClCopyKernel);
+    /** Initialize the kernel's src, dst.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data types supported: All.
+     * @param[out] dst             Destination tensor info. Data types supported: same as @p src.
+     * @param[in]  dst_window      (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr.
+     */
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo      *src,
+                   ITensorInfo            *dst,
+                   Window                 *dst_window = nullptr);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClCopyKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Window *dst_window = nullptr);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    Window _dst_window{};
+    bool   _has_dst_window{};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_COPY_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClCropKernel.cpp b/src/gpu/cl/kernels/ClCropKernel.cpp
new file mode 100644
index 0000000000..0c503e13fc
--- /dev/null
+++ b/src/gpu/cl/kernels/ClCropKernel.cpp
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClCropKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+#include <map>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+ClCropKernel::ClCropKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClCropKernel::configure(const CLCompileContext &compile_context,
+                             const ITensorInfo      *src,
+                             ITensorInfo            *dst,
+                             Coordinates2D           start,
+                             Coordinates2D           end,
+                             uint32_t                batch_index,
+                             float                   extrapolation_value,
+                             Window                 *dst_window)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst, start, end, batch_index, extrapolation_value, dst_window));
+
+    _start               = start;
+    _batch_index         = batch_index;
+    _extrapolation_value = extrapolation_value;
+
+    const uint32_t vec_size_x = 4;
+    // Create and update the window (if needed)
+    Window win = calculate_max_window(*dst);
+
+    if (dst_window != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *dst_window);
+        win = *dst_window;
+    }
+
+    const uint32_t dst_width_x    = win.num_iterations(0);
+    const bool     multi_access_x = dst_width_x >= vec_size_x;
+    const bool     remainder_x    = dst_width_x % vec_size_x > 0;
+
+    if (multi_access_x)
+    {
+        win.set(Window::DimX,
+                Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+    }
+    ICLKernel::configure_internal(win);
+
+    // Create kernel
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+    build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+    build_opts.add_option_if(multi_access_x && remainder_x,
+                             "-DLAST_ACCESSED_X=" +
+                                 support::cpp11::to_string(std::max<int>(dst_width_x - vec_size_x, 0)));
+    build_opts.add_option_if(start.x > end.x, "-DWIDTH_FLIPPED=");
+    build_opts.add_option_if(start.y > end.y, "-DHEIGHT_FLIPPED=");
+    _kernel = create_kernel(compile_context, "crop_tensor", build_opts.options());
+}
+
+Status ClCropKernel::validate(const ITensorInfo *src,
+                              const ITensorInfo *dst,
+                              Coordinates2D      start,
+                              Coordinates2D      end,
+                              uint32_t           batch_index,
+                              float              extrapolation_value,
+                              Window            *dst_window)
+{
+    ARM_COMPUTE_UNUSED(extrapolation_value, dst_window);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+    ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().num_dimensions() > 4);
+    ARM_COMPUTE_RETURN_ERROR_ON(start.x < 0 || start.y < 0 || end.x < 0 || end.y < 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        start.x >= static_cast<int32_t>(src->dimension(1)) || start.y >= static_cast<int32_t>(src->dimension(2)) ||
+        end.x >= static_cast<int32_t>(src->dimension(1)) || end.y >= static_cast<int32_t>(src->dimension(2)));
+    ARM_COMPUTE_RETURN_ERROR_ON(batch_index >= src->dimension(3));
+    if (dst_window != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(dst_window->x().step() != 1);
+    }
+    if (dst->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(dst, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+        ARM_COMPUTE_RETURN_ERROR_ON(dst->num_dimensions() > 3);
+    }
+    return Status{};
+}
+
+void ClCropKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    Window in_slice = Window();
+    in_slice.use_tensor_dimensions(src->info()->tensor_shape());
+    in_slice.set(Window::DimX,
+                 Window::Dimension(in_slice.x().start(), ceil_to_multiple(in_slice.x().end(), window.x().step()),
+                                   window.x().step()));
+    in_slice.set(3, Window::Dimension(_batch_index, _batch_index + 1, 1));
+
+    unsigned int idx = 0;
+    add_3D_tensor_argument(idx, src, in_slice);
+    add_3D_tensor_argument(idx, dst, window);
+    add_argument(idx, _start.x);
+    add_argument(idx, _start.y);
+    enqueue(queue, *this, window, lws_hint());
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClCropKernel.h b/src/gpu/cl/kernels/ClCropKernel.h
new file mode 100644
index 0000000000..506262608c
--- /dev/null
+++ b/src/gpu/cl/kernels/ClCropKernel.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_CROP_KERNEL_H
+#define ARM_COMPUTE_CL_CROP_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to perform a copy between two tensors */
+class ClCropKernel : public IClKernel
+{
+public:
+    ClCropKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClCropKernel);
+    /** Configure kernel
+     *
+     * @note Supported tensor rank: up to 4
+     *
+     * @param[in]  compile_context     The compile context to be used.
+     * @param[in]  src                 Source tensor info. Data type supported: All. Data layouts supported: NHWC.
+     * @param[out] dst                 Destination tensor info. Data type supported: F32
+     * @param[in]  start               Coordinates of where to start cropping the image.
+     * @param[in]  end                 Coordinates of where to end cropping the image.
+     * @param[in]  batch_index         Fourth dimension index of the 3D image to crop in @p src.
+     * @param[in]  extrapolation_value Value to be used for values outside of the image. Default is 0.
+     * @param[in]  dst_window          Output window to be used in case cropped image is being copied into a tensor. Default is nullptr.
+     */
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo      *src,
+                   ITensorInfo            *dst,
+                   Coordinates2D           start,
+                   Coordinates2D           end,
+                   uint32_t                batch_index,
+                   float                   extrapolation_value = 0,
+                   Window                 *dst_window          = nullptr);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClCropKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src,
+                           const ITensorInfo *dst,
+                           Coordinates2D      start,
+                           Coordinates2D      end,
+                           uint32_t           batch_index,
+                           float              extrapolation_value = 0,
+                           Window            *dst_window          = nullptr);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    Coordinates2D _start{};
+    uint32_t      _batch_index{};
+    float         _extrapolation_value{};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_CROP_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp b/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp
new file mode 100644
index 0000000000..ec44d88f01
--- /dev/null
+++ b/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2017-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClDepthConcatenateKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX));
+    ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) != dst->dimension(Window::DimY));
+    ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(2) + depth_offset > dst->dimension(2));
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(3, src, dst);
+
+    return Status{};
+}
+} // namespace
+
+ClDepthConcatenateKernel::ClDepthConcatenateKernel() : _depth_offset(0)
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClDepthConcatenateKernel::configure(const CLCompileContext &compile_context,
+                                         ITensorInfo            *src,
+                                         unsigned int            depth_offset,
+                                         ITensorInfo            *dst)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, depth_offset, dst));
+
+    auto padding_info = get_padding_info({src, dst});
+
+    _depth_offset = depth_offset;
+
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0));
+
+    // Add build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+    if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
+    {
+        const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
+
+        build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
+        build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
+        build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale));
+        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
+    }
+
+    std::string kernel_name = "concatenate";
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel window
+    auto win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration));
+    win.set(Window::DimZ, Window::Dimension(0, src->tensor_shape().z(), 1));
+    ICLKernel::configure_internal(win);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClDepthConcatenateKernel::validate(const arm_compute::ITensorInfo *src,
+                                          unsigned int                    depth_offset,
+                                          const arm_compute::ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, depth_offset, dst));
+    return Status{};
+}
+
+void ClDepthConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    Window slice = window.first_slice_window_3D();
+
+    const int offset_to_first_elements_in_bytes = _depth_offset * dst->info()->strides_in_bytes()[2];
+
+    unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters
+    _kernel.setArg<cl_int>(idx, offset_to_first_elements_in_bytes);
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src, slice);
+        add_3D_tensor_argument(idx, dst, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    } while (window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClDepthConcatenateKernel.h b/src/gpu/cl/kernels/ClDepthConcatenateKernel.h
new file mode 100644
index 0000000000..539f010303
--- /dev/null
+++ b/src/gpu/cl/kernels/ClDepthConcatenateKernel.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_DEPTH_CONCATENATE_KERNEL_H
+#define ARM_COMPUTE_CL_DEPTH_CONCATENATE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the depth concatenate kernel.
+ *  The src tensor will be concatenated into the dst tensor.
+ */
+class ClDepthConcatenateKernel : public IClKernel
+{
+public:
+    ClDepthConcatenateKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDepthConcatenateKernel);
+    /** Initialise the kernel's source and destination
+     *
+     * @param[in]     compile_context The compile context to be used.
+     * @param[in]     src             Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]     depth_offset    The offset on the Z axis.
+     * @param[in,out] dst             Destination tensor info. Data types supported: Same as @p src.
+     *
+     * @note: The dst tensor's low two dimensions can't be smaller than the src one's.
+     * @note: The gaps between the two lowest dimensions of src and dst need to be divisible by 2.
+     *
+     */
+    void
+    configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClDepthConcatenateKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+
+private:
+    unsigned int _depth_offset;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_DEPTH_CONCATENATE_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClDequantizeKernel.cpp b/src/gpu/cl/kernels/ClDequantizeKernel.cpp
new file mode 100644
index 0000000000..53429ab1aa
--- /dev/null
+++ b/src/gpu/cl/kernels/ClDequantizeKernel.cpp
@@ -0,0 +1,162 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClDequantizeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8,
+                                                         DataType::QSYMM16);
+
+    if (dst->tensor_shape().total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+    }
+
+    return Status{};
+}
+} // namespace
+
+ClDequantizeKernel::ClDequantizeKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClDequantizeKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*dst, src->tensor_shape(), 1, DataType::F32);
+
+    auto padding_info = get_padding_info({src, dst});
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
+
+    const int  vec_size_x     = 16 / dst->element_size();
+    const int  output_width_x = dst->tensor_shape().x();
+    const bool multi_access_x = (output_width_x / vec_size_x > 0);
+
+    const bool  is_quantized_per_channel = is_data_type_quantized_per_channel(src->data_type());
+    std::string kernel_name              = "dequantization_layer";
+
+    // Create kernel
+    CLBuildOptions build_opts;
+    if (!is_quantized_per_channel)
+    {
+        const UniformQuantizationInfo qinfo   = src->quantization_info().uniform();
+        const int                     qoffset = is_data_type_quantized_asymmetric(src->data_type()) ? qinfo.offset : 0;
+        build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(qinfo.scale));
+        build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qoffset));
+    }
+    else
+    {
+        kernel_name += "_per_channel";
+        kernel_name += src->data_layout() == DataLayout::NCHW ? "_nchw" : "_nhwc";
+    }
+
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+    build_opts.add_option("-DDATA_TYPE_SRC=" + get_cl_type_from_data_type(src->data_type()));
+    build_opts.add_option("-DDATA_TYPE_DST=" + get_cl_type_from_data_type(dst->data_type()));
+    build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(
+                                                                        std::max<int>(output_width_x - vec_size_x, 0)));
+
+    // Create kernel name
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*dst);
+    if (multi_access_x)
+    {
+        win.set(Window::DimX,
+                Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+    }
+    ICLKernel::configure_internal(win);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClDequantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
+    return Status{};
+}
+
+void ClDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    const bool is_quantized_per_channel = is_data_type_quantized_per_channel(src->info()->data_type());
+
+    // Collapse windo
+    Window new_window = is_quantized_per_channel ? window.collapse_if_possible(ICLKernel::window(), 4)
+                                                 : window.collapse_if_possible(ICLKernel::window(), 3);
+    Window slice      = new_window.first_slice_window_3D();
+
+    if (is_quantized_per_channel)
+    {
+        unsigned int idx = num_arguments_per_3D_tensor() * 2; //Skip the input and output parameters
+        _kernel.setArg(idx++, src->quantization().scale->cl_buffer());
+    }
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src, slice);
+        add_3D_tensor_argument(idx, dst, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    } while (new_window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClDequantizeKernel.h b/src/gpu/cl/kernels/ClDequantizeKernel.h
new file mode 100644
index 0000000000..a32f506c9a
--- /dev/null
+++ b/src/gpu/cl/kernels/ClDequantizeKernel.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_DEQUANTIZE_KERNEL_H
+#define ARM_COMPUTE_CL_DEQUANTIZE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the dequantization layer kernel. */
+class ClDequantizeKernel : public IClKernel
+{
+public:
+    ClDequantizeKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDequantizeKernel);
+    /** Initialise the kernel's input and output
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
+     * @param[out] dst             Destination tensor info. Data types supported: F16/F32.
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClDequantizeKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_DEQUANTIZE_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
new file mode 100644
index 0000000000..7cf1958c1b
--- /dev/null
+++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
@@ -0,0 +1,542 @@
+/*
+ * Copyright (c) 2017-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClDirectConv2dKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLUtils.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo                 *src,
+                          const ITensorInfo                 *weights,
+                          const ITensorInfo                 *biases,
+                          const ITensorInfo                 *dst,
+                          const PadStrideInfo               &conv_info,
+                          const ActivationLayerInfo         &act_info,
+                          const DirectConvComputeKernelInfo &desc)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
+
+    const DataLayout data_layout = src->data_layout();
+    const int        width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int        channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx),
+                                    "Weights feature map dimension should match the respective src's one");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional");
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_input_to_cl_image == true,
+                                    "Export to CLImage is not supported for the input tensor");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_output_to_cl_image == true,
+                                    "Export to CLImage is not supported for the output tensor");
+
+    if (data_layout == DataLayout::NCHW)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx),
+                                        "Weights should have same width and height");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3,
+                                        "Strides larger than 3 not supported for 1x1 convolution.");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5 ||
+                                         weights->dimension(width_idx) == 9) &&
+                                            std::get<0>(conv_info.stride()) > 2,
+                                        "Strides larger than 2 not supported for 3x3, 5x5, 9x9 convolution.");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(act_info.enabled(), "Fused activation is not supported for NCHW layout");
+
+        if (is_data_type_quantized(src->data_type()))
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 &&
+                    weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9,
+                "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantized data types");
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 &&
+                    weights->dimension(width_idx) != 5,
+                "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types");
+        }
+    }
+
+    if (data_layout == DataLayout::NHWC)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(act_info.enabled() && !is_data_type_float(src->data_type()),
+                                        "Fused activation in NHWC is only supported for floating point.");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8,
+                                        "M0 can only be greater than 0 and less than or equal to 8");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 &&
+                                            desc.n0 != 16,
+                                        "N0 can only be: 1, 2, 3, 4, 8, and 16");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 &&
+                                            desc.k0 != 16,
+                                        "K0 can only be: 1, 2, 3, 4, 8, and 16");
+        if (desc.export_weights_to_cl_image)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16,
+                                            "K0 can only be: 4, 8, and 16");
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(weights),
+                                            "Export to CLImage is not supported for this weight configuration");
+        }
+    }
+
+    if (biases != nullptr)
+    {
+        if (is_data_type_quantized_asymmetric(src->data_type()))
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+        }
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3),
+                                        "Biases size and number of dst feature maps should match");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, "Biases should be one dimensional");
+    }
+
+    // Checks performed when dst is configured
+    if (dst->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+            dst->tensor_shape(), misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+    }
+
+    const auto data_type = src->data_type();
+    if (is_data_type_quantized(data_type))
+    {
+        const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
+        const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
+        const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
+
+        float multiplier        = iqinfo.scale * wqinfo.scale / oqinfo.scale;
+        int   output_multiplier = 0;
+        int   output_shift      = 0;
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+    }
+    return Status{};
+}
+} // namespace
+
+ClDirectConv2dKernel::ClDirectConv2dKernel()
+{
+    _type = CLKernelType::DIRECT;
+}
+
+void ClDirectConv2dKernel::configure(const CLCompileContext            &compile_context,
+                                     ITensorInfo                       *src,
+                                     ITensorInfo                       *weights,
+                                     ITensorInfo                       *biases,
+                                     ITensorInfo                       *dst,
+                                     const PadStrideInfo               &conv_info,
+                                     const ActivationLayerInfo         &act_info,
+                                     const DirectConvComputeKernelInfo &desc)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+
+    // Perform validation
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info, act_info, desc));
+
+    const int conv_stride_x = std::get<0>(conv_info.stride());
+    const int conv_stride_y = std::get<1>(conv_info.stride());
+
+    _data_layout = src->data_layout();
+    _conv_info   = conv_info;
+
+    const unsigned int width_idx   = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+    const unsigned int kernel_size = weights->dimension(width_idx);
+    const DataType     data_type   = src->data_type();
+
+    const GPUTarget gpu_target                         = get_target();
+    unsigned int    _num_elems_processed_per_iteration = 0;
+
+    // Get dst shape
+    TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
+
+    // Output auto inizialitation if not yet initialized
+    auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info());
+
+    // Configure kernel window
+    Window win;
+    if (_data_layout == DataLayout::NHWC)
+    {
+        output_shape.collapse(2U, 1U);
+        const unsigned int n0 = adjust_vec_size(desc.n0, output_shape[0]);
+        const unsigned int m0 = adjust_vec_size(desc.m0, output_shape[1]);
+
+        // Create window and update padding
+        win = calculate_max_window(output_shape, Steps(n0, m0));
+    }
+    else if (_data_layout == DataLayout::NCHW)
+    {
+        _num_elems_processed_per_iteration = 1u;
+        win                                = calculate_max_window(*dst, Steps(_num_elems_processed_per_iteration));
+    }
+
+    ICLKernel::configure_internal(win);
+
+    std::stringstream kernel_name;
+    CLBuildOptions    build_options;
+
+    if (_data_layout == DataLayout::NHWC)
+    {
+        kernel_name << "direct_convolution_nhwc";
+
+        const unsigned int n0               = win.x().step();
+        const unsigned int m0               = win.y().step();
+        const unsigned int k0               = adjust_vec_size(desc.k0, src->dimension(channel_idx));
+        const unsigned int partial_store_n0 = dst->dimension(channel_idx) % n0;
+        const unsigned int pad_left         = conv_info.pad_left();
+        const unsigned int pad_top          = conv_info.pad_top();
+
+        _export_weights_to_cl_image = desc.export_weights_to_cl_image;
+        _export_input_to_cl_image   = desc.export_input_to_cl_image;
+        _export_output_to_cl_image  = desc.export_output_to_cl_image;
+
+        // Update the padding for the weights tensor if we can export to cl_image
+        if (_export_weights_to_cl_image)
+        {
+            gemm::update_padding_for_cl_image(weights);
+        }
+
+        if (_export_output_to_cl_image)
+        {
+            gemm::update_padding_for_cl_image(dst);
+        }
+
+        if (_export_input_to_cl_image)
+        {
+            gemm::update_padding_for_cl_image(src);
+        }
+
+        if (biases != nullptr)
+        {
+            build_options.add_option(std::string("-DHAS_BIAS"));
+            build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type())));
+        }
+
+        // Conditions of -cl-fast-relaxed-math causing accuracy issues can be traced from COMPMID-5324
+        const auto act_function  = act_info.activation();
+        const auto dst_data_type = dst->data_type();
+
+        if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+            (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU ||
+             act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) &&
+            (dst_data_type == DataType::F32 || dst_data_type == DataType::F16))
+        {
+            // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations
+            // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations
+            build_options.add_option("-cl-unsafe-math-optimizations");
+        }
+        else
+        {
+            build_options.add_option("-cl-fast-relaxed-math");
+        }
+
+        build_options.add_option_if_else(_export_input_to_cl_image, "-DSRC_TENSOR_TYPE=IMAGE",
+                                         "-DSRC_TENSOR_TYPE=BUFFER");
+        build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+        build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(0)));
+        build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(1)));
+        build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(2)));
+        build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(0)));
+        build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(1)));
+        build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(2)));
+        build_options.add_option_if_else(_export_output_to_cl_image, "-DDST_TENSOR_TYPE=IMAGE",
+                                         "-DDST_TENSOR_TYPE=BUFFER");
+        build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst_data_type));
+        build_options.add_option_if_else(_export_weights_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE",
+                                         "-DWEI_TENSOR_TYPE=BUFFER");
+        build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx)));
+        build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx)));
+        build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(weights->data_type()));
+        build_options.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x));
+        build_options.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_stride_y));
+        build_options.add_option("-DPAD_LEFT=" + support::cpp11::to_string(pad_left));
+        build_options.add_option("-DPAD_TOP=" + support::cpp11::to_string(pad_top));
+        build_options.add_option("-DN0=" + support::cpp11::to_string(n0));
+        build_options.add_option("-DM0=" + support::cpp11::to_string(m0));
+        build_options.add_option("-DK0=" + support::cpp11::to_string(k0));
+        build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
+        build_options.add_option_if((src->dimension(channel_idx) % k0) != 0, "-DLEFTOVER_LOOP");
+        build_options.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_function)));
+
+        if (is_data_type_quantized(data_type))
+        {
+            const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
+            const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
+            const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
+
+            PixelValue zero_value = PixelValue(0, src->data_type(), src->quantization_info());
+            int        zero_value_s32;
+            zero_value.get(zero_value_s32);
+
+            float multiplier        = iqinfo.scale * wqinfo.scale / oqinfo.scale;
+            int   output_multiplier = 0;
+            int   output_shift      = 0;
+            quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
+            build_options.add_option("-DIS_QUANTIZED");
+            build_options.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+            build_options.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift));
+            build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(-iqinfo.offset));
+            build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(-wqinfo.offset));
+            build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset));
+            build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32));
+            build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32));
+        }
+        else
+        {
+            build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(data_type));
+            build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(0));
+            build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(0));
+            build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(0));
+            build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(0));
+            build_options.add_option_if(act_info.enabled(),
+                                        "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
+            build_options.add_option_if(act_info.enabled(),
+                                        "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
+        }
+
+        if (compile_context.get_ddk_version() >= 30)
+        {
+            build_options.add_option("-fregister-allocation=64");
+        }
+    }
+    else
+    {
+        _export_weights_to_cl_image = false;
+
+        kernel_name << "direct_convolution_nchw";
+        build_options.add_option_if(biases != nullptr, std::string("-DHAS_BIAS"));
+        build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(width_idx)));
+        build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(height_idx)));
+        build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(channel_idx)));
+        build_options.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
+        build_options.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
+        build_options.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x));
+        build_options.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_stride_y));
+        build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx)));
+        build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx)));
+        build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
+        build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));
+        build_options.add_option(
+            std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx))));
+        build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x)));
+        build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type)));
+        build_options.add_option(
+            std::string("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration)));
+        build_options.add_option(
+            std::string("-DVEC_SIZE_LEFTOVER=" +
+                        support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration)));
+
+        if (is_data_type_quantized(data_type))
+        {
+            const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
+            const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
+            const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
+
+            float multiplier        = iqinfo.scale * wqinfo.scale / oqinfo.scale;
+            int   output_multiplier = 0;
+            int   output_shift      = 0;
+            quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
+            build_options.add_option("-DIS_QUANTIZED");
+            build_options.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+            build_options.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
+            build_options.add_option("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size));
+            build_options.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iqinfo.offset));
+            build_options.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wqinfo.offset));
+            build_options.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oqinfo.offset));
+        }
+    }
+
+    _kernel = create_kernel(compile_context, kernel_name.str(), build_options.options());
+
+    // Set config_id for enabling LWS tuning
+    // config_id should include the variables used to parameterize the kernel
+    _config_id = kernel_name.str();
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_type(data_type));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(kernel_size);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(border_size().left);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(border_size().top);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(border_size().right);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(border_size().bottom);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(conv_stride_x);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(conv_stride_y);
+    // SRC_CHANNELS, SRC_WIDTH, SRC_HEIGHT
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src->dimension(channel_idx));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src->dimension(width_idx));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src->dimension(height_idx));
+    _config_id += "_";
+    // DST_CHANNELS, DST_WIDTH, DST_HEIGHT
+    _config_id += support::cpp11::to_string(dst->dimension(channel_idx));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(width_idx));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(height_idx));
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_layout(_data_layout));
+}
+
+Status ClDirectConv2dKernel::validate(const ITensorInfo                 *src,
+                                      const ITensorInfo                 *weights,
+                                      const ITensorInfo                 *biases,
+                                      const ITensorInfo                 *dst,
+                                      const PadStrideInfo               &conv_info,
+                                      const ActivationLayerInfo         &act_info,
+                                      const DirectConvComputeKernelInfo &desc)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info, desc));
+    return Status{};
+}
+
+void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    // Get initial windows
+    Window slice = window.first_slice_window_3D();
+
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto weights =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const auto biases =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    if (_data_layout == DataLayout::NHWC)
+    {
+        cl::Image2D weights_cl_image;
+        cl::Image2D output_cl_image;
+        cl::Image2D input_cl_image;
+
+        if (_export_weights_to_cl_image)
+        {
+            // Export tensor to cl_image
+            weights_cl_image = create_image2d_from_tensor(weights, CLImage2DType::ReadOnly);
+        }
+
+        if (_export_output_to_cl_image)
+        {
+            // Export tensor to cl_image
+            output_cl_image = create_image2d_from_tensor(dst, CLImage2DType::WriteOnly);
+        }
+
+        if (_export_input_to_cl_image)
+        {
+            // Export tensor to cl_image
+            input_cl_image = create_image2d_from_tensor(src, CLImage2DType::ReadOnly);
+        }
+
+        unsigned int idx = 0;
+        if (_export_input_to_cl_image)
+        {
+            _kernel.setArg(idx++, input_cl_image);
+        }
+        add_4d_tensor_nhwc_argument(idx, src);
+        if (_export_output_to_cl_image)
+        {
+            _kernel.setArg(idx++, output_cl_image);
+        }
+        add_4d_tensor_nhwc_argument(idx, dst);
+        if (_export_weights_to_cl_image)
+        {
+            _kernel.setArg(idx++, weights_cl_image);
+        }
+        add_4d_tensor_nhwc_argument(idx, weights);
+        if (biases != nullptr)
+        {
+            add_1D_tensor_argument(idx, biases, slice);
+        }
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    else
+    {
+        unsigned int idx1 = 2 * num_arguments_per_3D_tensor();
+        add_3D_tensor_argument(idx1, weights, slice);
+
+        if (biases != nullptr)
+        {
+            Window slice_biases;
+            slice_biases.use_tensor_dimensions(biases->info()->tensor_shape());
+            add_1D_tensor_argument(idx1, biases, slice_biases);
+        }
+
+        _kernel.setArg(idx1++, static_cast<unsigned int>(weights->info()->strides_in_bytes()[3]));
+
+        do
+        {
+            unsigned int idx = 0;
+            add_3D_tensor_argument(idx, src, slice);
+            add_3D_tensor_argument(idx, dst, slice);
+            enqueue(queue, *this, slice, lws_hint());
+        } while (window.slide_window_slice_3D(slice));
+    }
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.h b/src/gpu/cl/kernels/ClDirectConv2dKernel.h
new file mode 100644
index 0000000000..c934c825ca
--- /dev/null
+++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2017-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H
+#define ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+// Forward declaration
+struct DirectConvComputeKernelInfo;
+
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the  direct convolution kernel. */
+class ClDirectConv2dKernel : public IClKernel
+{
+public:
+    ClDirectConv2dKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDirectConv2dKernel);
+    /** Set the src, weights, biases and dst tensors info.
+     *
+     * @note: Due to set_valid_region() in NCHW, src/weights/biases cannot be const. Need to change this once the set_valid_region() is removed.
+     *
+     * @note: DirectConvolution only works in the following configurations for the NCHW data layout:
+     *        1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3
+     *        3x3 convolution with stride_x = 1/2, stride_y = 1/2
+     *        5x5 convolution with stride_x = 1/2, stride_y = 1/2
+     *        9x9 convolution with stride_x = 1/2, stride_y = 1/2
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             The src tensor info to convolve. 3 lower dimensions represent a single src [width, height, IFM],
+     *                             while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+     * @param[in]  weights         Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+     *                             The 3rd dimension must be the same as the src's volume 3rd dimension.
+     *                             Data type supported:Same as @p src.
+     * @param[in]  biases          Biases tensor info. Biases are 1D tensor with dimension [OFM].
+     *                             Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
+     * @param[out] dst             Output tensor info.
+     *                             The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: Same as @p src.
+     * @param[in]  conv_info       Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  act_info        Contains activaton information described in @ref ActivationLayerInfo.
+     * @param[in]  desc            Direct convolution descriptor used to build the NHWC direct convolution kernel. For NCHW, this parameter is ignored.
+     */
+    void configure(const CLCompileContext            &compile_context,
+                   ITensorInfo                       *src,
+                   ITensorInfo                       *weights,
+                   ITensorInfo                       *biases,
+                   ITensorInfo                       *dst,
+                   const PadStrideInfo               &conv_info,
+                   const ActivationLayerInfo         &act_info,
+                   const DirectConvComputeKernelInfo &desc);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClDirectConv2dKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo                 *src,
+                           const ITensorInfo                 *weights,
+                           const ITensorInfo                 *biases,
+                           const ITensorInfo                 *dst,
+                           const PadStrideInfo               &conv_info,
+                           const ActivationLayerInfo         &act_info,
+                           const DirectConvComputeKernelInfo &desc);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+public:
+    DataLayout    _data_layout{};
+    PadStrideInfo _conv_info{};
+    bool          _export_weights_to_cl_image{false};
+    bool          _export_output_to_cl_image{false};
+    bool          _export_input_to_cl_image{false};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp
new file mode 100644
index 0000000000..8002520a87
--- /dev/null
+++ b/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClDirectConv3dKernel.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src0,
+                          const ITensorInfo *src1,
+                          const ITensorInfo *src2,
+                          const ITensorInfo *dst,
+                          const Conv3dInfo  &conv3d_info)
+{
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_LAYOUT(src0, src1, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported");
+
+    // When fusing activation, same workaround introduced for COMPMID-5324 may be necessary
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv3d_info.act_info.enabled(), "Fused activation not supported");
+
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
+    ARM_COMPUTE_RETURN_ERROR_ON(conv3d_info.dilation != Size3D(1U, 1U, 1U));
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->dimension(1) != src0->dimension(0),
+                                    "Weights feature map dimension should match the respective src's one");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 5, "Weights can be at most 5 dimensional");
+
+    ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(2) >
+                                (src0->dimension(1) + conv3d_info.padding.left + conv3d_info.padding.right));
+    ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(3) >
+                                (src0->dimension(2) + conv3d_info.padding.top + conv3d_info.padding.bottom));
+    ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(4) >
+                                (src0->dimension(3) + conv3d_info.padding.front + conv3d_info.padding.back));
+
+    if (src2 != nullptr)
+    {
+        if (is_data_type_quantized(src0->data_type()))
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
+        }
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0),
+                                        "Biases size and number of dst feature maps should match");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->num_dimensions() > 1, "Biases should be one dimensional");
+    }
+
+    // Checks performed when dst is configured
+    if (dst->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src1->dimension(0), "Weights and dst OFMs should match");
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+            dst->tensor_shape(),
+            misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv3d_info));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
+    }
+
+    return Status{};
+}
+} // namespace
+
+ClDirectConv3dKernel::ClDirectConv3dKernel()
+{
+    _type = CLKernelType::DIRECT;
+}
+
+void ClDirectConv3dKernel::configure(const CLCompileContext &compile_context,
+                                     const ITensorInfo      *src0,
+                                     const ITensorInfo      *src1,
+                                     const ITensorInfo      *src2,
+                                     ITensorInfo            *dst,
+                                     const Conv3dInfo       &conv3d_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+
+    // Perform validation
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, conv3d_info));
+
+    // Create window and update padding
+    const DataType data_type      = src0->data_type();
+    const size_t   src_width      = src0->dimension(1);
+    const size_t   src_height     = src0->dimension(2);
+    const size_t   src_depth      = src0->dimension(3);
+    const size_t   src_channels   = src0->dimension(0);
+    const size_t   dst_width      = dst->dimension(1);
+    const size_t   dst_height     = dst->dimension(2);
+    const size_t   dst_depth      = dst->dimension(3);
+    const size_t   dst_channels   = dst->dimension(0);
+    const size_t   weights_width  = src1->dimension(2);
+    const size_t   weights_height = src1->dimension(3);
+    const size_t   weights_depth  = src1->dimension(4);
+    const size_t   pad_left       = conv3d_info.padding.left;
+    const size_t   pad_top        = conv3d_info.padding.top;
+    const size_t   pad_front      = conv3d_info.padding.front;
+    const size_t   conv_stride_x  = conv3d_info.stride.x();
+    const size_t   conv_stride_y  = conv3d_info.stride.y();
+    const size_t   conv_stride_z  = conv3d_info.stride.z();
+
+    const size_t n0               = std::min(dst->dimension(0), static_cast<size_t>(4u));
+    const size_t m0               = (dst->tensor_shape()[0] > 16) ? ((data_type == DataType::F32) ? 2U : 4U) : 1U;
+    const size_t k0               = adjust_vec_size(8u, src0->dimension(0));
+    const size_t partial_store_n0 = dst->dimension(0) % n0;
+
+    CLBuildOptions build_options;
+    build_options.add_option("-cl-fast-relaxed-math");
+    build_options.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width));
+    build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height));
+    build_options.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(src_depth));
+    build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src_channels));
+    build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst_width));
+    build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst_height));
+    build_options.add_option("-DDST_DEPTH=" + support::cpp11::to_string(dst_depth));
+    build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst_channels));
+    build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights_width));
+    build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights_height));
+    build_options.add_option("-DWEI_DEPTH=" + support::cpp11::to_string(weights_depth));
+    build_options.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x));
+    build_options.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_stride_y));
+    build_options.add_option("-DSTRIDE_Z=" + support::cpp11::to_string(conv_stride_z));
+    build_options.add_option("-DPAD_LEFT=" + support::cpp11::to_string(pad_left));
+    build_options.add_option("-DPAD_TOP=" + support::cpp11::to_string(pad_top));
+    build_options.add_option("-DPAD_FRONT=" + support::cpp11::to_string(pad_front));
+    build_options.add_option("-DN0=" + support::cpp11::to_string(n0));
+    build_options.add_option("-DM0=" + support::cpp11::to_string(m0));
+    build_options.add_option("-DK0=" + support::cpp11::to_string(k0));
+    build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
+
+    if (src2 != nullptr)
+    {
+        build_options.add_option(std::string("-DHAS_BIAS"));
+        build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(src2->data_type())));
+    }
+
+    if (is_data_type_quantized(data_type))
+    {
+        const UniformQuantizationInfo iqinfo = src0->quantization_info().uniform();
+        const UniformQuantizationInfo wqinfo = src1->quantization_info().uniform();
+        const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
+
+        PixelValue zero_value = PixelValue(0, src0->data_type(), src0->quantization_info());
+        int        zero_value_s32;
+        zero_value.get(zero_value_s32);
+
+        float multiplier        = iqinfo.scale * wqinfo.scale / oqinfo.scale;
+        int   output_multiplier = 0;
+        int   output_shift      = 0;
+        quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
+        build_options.add_option("-DIS_QUANTIZED");
+        build_options.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+        build_options.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift));
+        build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(-iqinfo.offset));
+        build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(-wqinfo.offset));
+        build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset));
+        build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32));
+        build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32));
+    }
+    else
+    {
+        build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::F32));
+        build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(0));
+        build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(0));
+        build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(0));
+        build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(0));
+    }
+
+    std::string kernel_name = "direct_convolution3d_ndhwc";
+    _kernel                 = create_kernel(compile_context, kernel_name, build_options.options());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*dst, Steps(n0, m0));
+    ICLKernel::configure_internal(win);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_type(data_type));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(weights_width);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(weights_height);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(weights_depth);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(conv_stride_x);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(conv_stride_y);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(conv_stride_z);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst_width);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst_height);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst_channels);
+}
+
+Status ClDirectConv3dKernel::validate(const ITensorInfo *src0,
+                                      const ITensorInfo *src1,
+                                      const ITensorInfo *src2,
+                                      const ITensorInfo *dst,
+                                      const Conv3dInfo  &conv3d_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, conv3d_info));
+    return Status{};
+}
+
+void ClDirectConv3dKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto weights =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const auto biases =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    // Get initial windows
+    Window slice = window.first_slice_window_3D();
+    slice.set(Window::DimY, Window::Dimension(0,
+                                              ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2) *
+                                                                   dst->info()->dimension(3),
+                                                               slice.y().step()),
+                                              slice.y().step()));
+    slice.set(Window::DimZ, Window::Dimension(0, dst->info()->dimension(4), 1));
+
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, src, slice);
+    add_4D_tensor_argument(idx, dst, slice);
+    add_4D_tensor_argument(idx, weights, slice);
+    if (biases != nullptr)
+    {
+        add_1D_tensor_argument(idx, biases, slice);
+    }
+    enqueue(queue, *this, slice, lws_hint());
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClDirectConv3dKernel.h b/src/gpu/cl/kernels/ClDirectConv3dKernel.h
new file mode 100644
index 0000000000..cb7509d8fa
--- /dev/null
+++ b/src/gpu/cl/kernels/ClDirectConv3dKernel.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_DIRECT_CONV3D_KERNEL_H
+#define ARM_COMPUTE_CL_DIRECT_CONV3D_KERNEL_H
+
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+class CLCompileContext;
+struct Conv3dInfo;
+
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the  direct convolution 3d kernel. */
+class ClDirectConv3dKernel : public IClKernel
+{
+public:
+    /** Construtor */
+    ClDirectConv3dKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    ClDirectConv3dKernel(const ClDirectConv3dKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    ClDirectConv3dKernel &operator=(const ClDirectConv3dKernel &) = delete;
+    /** Default move constructor */
+    ClDirectConv3dKernel(ClDirectConv3dKernel &&) = default;
+    /** Default move assignment operator */
+    ClDirectConv3dKernel &operator=(ClDirectConv3dKernel &&) = default;
+    /** Set the src, weights, biases and dst tensors info.
+     *
+     * Valid data layouts:
+     * - NDHWC
+     *
+     * Valid data type configurations:
+     * |src0           |src1           |src2   |dst            |
+     * |:--------------|:--------------|:------|:--------------|
+     * |F16            |F16            |F16    |F16            |
+     * |F32            |F32            |F32    |F32            |
+     * |QASYMM8        |QASYMM8        |S32    |QASYMM8        |
+     * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32    |QASYMM8_SIGNED |
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src0            Source tensor. 4 lower dimensions represent a single src [IFM, width, height, depth],
+     *                             while every optional dimension from 5 and above represent a batch of srcs.
+     * @param[in]  src1            Weights tensor. Weights are 5D tensor with dimensions [OFM, IFM, kernel_w, kernel_h, kernel_d].
+     * @param[in]  src2            Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+     * @param[out] dst             Destination tensor. 4 lower dimensions represent a single dst [OFM, width, height, depth], while the rest represent batch of dsts.
+     * @param[in]  conv3d_info     Contains strides, padding, rounding, activation, dilation and fast math information. Activation and fast math are currently unused.
+     */
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo      *src0,
+                   const ITensorInfo      *src1,
+                   const ITensorInfo      *src2,
+                   ITensorInfo            *dst,
+                   const Conv3dInfo       &conv3d_info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClDirectConv3dKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src0,
+                           const ITensorInfo *src1,
+                           const ITensorInfo *src2,
+                           const ITensorInfo *dst,
+                           const Conv3dInfo  &conv3d_info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_DIRECT_CONV3D_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClElementwiseKernel.cpp b/src/gpu/cl/kernels/ClElementwiseKernel.cpp
new file mode 100644
index 0000000000..cdb3527a92
--- /dev/null
+++ b/src/gpu/cl/kernels/ClElementwiseKernel.cpp
@@ -0,0 +1,603 @@
+/*
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/common/utils/Validate.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+#include <map>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+constexpr unsigned int vector_size_byte_opencl = 16;
+
+std::map<ArithmeticOperation, std::string> supported_arithmetic_ops = {
+    {ArithmeticOperation::ADD, "ADD"},     {ArithmeticOperation::SUB, "SUB"},
+    {ArithmeticOperation::DIV, "DIV"},     {ArithmeticOperation::SQUARED_DIFF, "SQUARED_DIFF"},
+    {ArithmeticOperation::MIN, "MIN"},     {ArithmeticOperation::MAX, "MAX"},
+    {ArithmeticOperation::POWER, "POWER"}, {ArithmeticOperation::PRELU, "PRELU"},
+};
+
+std::map<ArithmeticOperation, std::string> supported_sat_arithmetic_ops = {
+    {ArithmeticOperation::ADD, "ADD"},
+    {ArithmeticOperation::SUB, "SUB"},
+};
+
+std::string
+generate_id_for_tuning_common(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst)
+{
+    std::string config_id;
+    // Set config_id for enabling LWS tuning
+    config_id = kernel_name;
+    config_id += "_";
+    config_id += lower_string(string_from_data_type(src1.data_type()));
+    config_id += "_";
+    config_id += support::cpp11::to_string(dst.dimension(0));
+    config_id += "_";
+    config_id += support::cpp11::to_string(dst.dimension(1));
+    return config_id;
+}
+
+Status validate_in_place_output_shape(const bool         in_place,
+                                      const bool         src1_in_place,
+                                      const ITensorInfo &src1,
+                                      const ITensorInfo &src2,
+                                      const ITensorInfo &dst,
+                                      const TensorShape &out_shape)
+{
+    if (in_place)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            detail::have_different_dimensions(out_shape, src1_in_place ? src1.tensor_shape() : src2.tensor_shape(), 0),
+            "Wrong shape for dst, cannot do in_place calculation");
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
+                                        "Wrong shape for dst");
+    }
+    return Status{};
+}
+
+Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src1,
+                                                          const ITensorInfo &src2,
+                                                          const ITensorInfo &dst)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(&src1, &src2, &dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src1);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &src2);
+
+    // Check whether it is in_place calculation
+    const bool in_place      = (&src1 == &dst) || (&src2 == &dst);
+    const bool src1_in_place = in_place && (&src1 == &dst);
+
+    const TensorShape out_shape = TensorShape::broadcast_shape(src1.tensor_shape(), src2.tensor_shape());
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+
+    // Validate in case of configured dst
+    if (dst.total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::F16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &dst);
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape));
+    }
+
+    return Status{};
+}
+
+Status validate_arguments_divide_operation(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::F16, DataType::F32, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
+
+    // Check whether it is in_place calculation
+    const bool in_place      = (src1 == dst) || (src2 == dst);
+    const bool src1_in_place = in_place && (src1 == dst);
+
+    const TensorShape out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+
+    // Validate in case of configured dst
+    if (dst->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, dst);
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            validate_in_place_output_shape(in_place, src1_in_place, *src1, *src2, *dst, out_shape));
+    }
+
+    return Status{};
+}
+
+Status
+validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src1);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U8, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16,
+                                                         DataType::F16, DataType::S32, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &src2);
+
+    if (is_data_type_quantized_symmetric(src1.data_type()))
+    {
+        const int32_t in1_offset = src1.quantization_info().uniform().offset;
+        const int32_t in2_offset = src2.quantization_info().uniform().offset;
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_offset != 0, "For quantized symmetric, offset must be zero");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(in2_offset != 0, "For quantized symmetric, offset must be zero");
+    }
+
+    // Check whether it is in_place calculation
+    const bool in_place      = (&src1 == &dst) || (&src2 == &dst);
+    const bool src1_in_place = in_place && (&src1 == &dst);
+
+    const TensorShape out_shape = TensorShape::broadcast_shape(src1.tensor_shape(), src2.tensor_shape());
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+
+    // Validate in case of configured dst
+    if (dst.total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
+                                        "Wrong shape for dst");
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape));
+
+        if (is_data_type_quantized_symmetric(dst.data_type()))
+        {
+            const int32_t offset = dst.quantization_info().uniform().offset;
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(offset != 0, "For quantized symmetric, offset must be zero");
+        }
+    }
+    return Status{};
+}
+
+CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &src1,
+                                                            const ITensorInfo &src2,
+                                                            const ITensorInfo &dst,
+                                                            const std::string &operation_string)
+{
+    CLBuildOptions build_opts;
+
+    const unsigned int num_elems_processed_per_iteration =
+        adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0));
+
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src1.data_type()));
+    build_opts.add_option("-DVEC_SIZE_IN1=" +
+                          support::cpp11::to_string(src1.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_IN2=" +
+                          support::cpp11::to_string(src2.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(dst.dimension(0) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DOP=" + operation_string);
+    if (is_data_type_quantized(src1.data_type()))
+    {
+        const UniformQuantizationInfo iq1info = src1.quantization_info().uniform();
+        const UniformQuantizationInfo iq2info = src2.quantization_info().uniform();
+        const UniformQuantizationInfo oqinfo  = dst.quantization_info().uniform();
+
+        build_opts.add_option("-DOFFSET_IN1=" + support::cpp11::to_string(iq1info.offset));
+        build_opts.add_option("-DOFFSET_IN2=" + support::cpp11::to_string(iq2info.offset));
+        build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(oqinfo.offset));
+        build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1info.scale));
+        build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2info.scale));
+        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oqinfo.scale));
+    }
+    build_opts.add_option_if(src1.data_type() == DataType::S32, "-DS32");
+
+    // Check whether it is in_place calculation
+    const bool in_place      = (&src1 == &dst) || (&src2 == &dst);
+    const bool src1_in_place = in_place && (&src1 == &dst);
+    build_opts.add_option_if(in_place, "-DIN_PLACE");
+    build_opts.add_option_if(src1_in_place, "-DSRC1_IN_PLACE");
+
+    return build_opts;
+}
+
+std::pair<Status, Window> configure_window_arithmetic_common(ITensorInfo &dst)
+{
+    const unsigned int num_elems_processed_per_iteration =
+        adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0));
+    Window win = calculate_max_window(dst, Steps(num_elems_processed_per_iteration));
+    return std::make_pair(Status{}, win);
+}
+
+std::pair<Status, Window>
+validate_and_configure_window_for_arithmetic_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
+{
+    const std::pair<TensorShape, ValidRegion> broadcast_pair =
+        ITensorInfo::broadcast_shape_and_valid_region(src1, src2);
+    const TensorShape &out_shape = broadcast_pair.first;
+
+    auto_init_if_empty(dst, out_shape, 1, src1.data_type());
+
+    return configure_window_arithmetic_common(dst);
+}
+
+std::pair<Status, Window>
+validate_and_configure_window_for_logical_binary_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
+{
+    const std::pair<TensorShape, ValidRegion> broadcast_pair =
+        ITensorInfo::broadcast_shape_and_valid_region(src1, src2);
+    const TensorShape &out_shape = broadcast_pair.first;
+
+    set_shape_if_empty(dst, out_shape);
+    set_data_type_if_unknown(dst, DataType::U8);
+
+    return configure_window_arithmetic_common(dst);
+}
+
+std::pair<Status, Window>
+validate_and_configure_window_for_division(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
+{
+    const std::pair<TensorShape, ValidRegion> broadcast_pair =
+        ITensorInfo::broadcast_shape_and_valid_region(src1, src2);
+    const TensorShape &out_shape = broadcast_pair.first;
+
+    auto_init_if_empty(dst, out_shape, 1, src1.data_type());
+
+    return configure_window_arithmetic_common(dst);
+}
+} // namespace
+
+ClElementwiseKernel::ClElementwiseKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClElementwiseKernel::configure_common(const ClCompileContext &compile_context,
+                                           ITensorInfo            *src1,
+                                           ITensorInfo            *src2,
+                                           ITensorInfo            *dst)
+{
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(*src1, *src2, *dst);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+    std::string kernel_name = "elementwise_operation_" + name();
+    if (is_data_type_quantized(src1->data_type()))
+    {
+        kernel_name += "_quantized";
+    }
+
+    // Set kernel build options
+    CLBuildOptions build_opts = generate_build_options(*src1, *src2, *dst);
+    if (_act_info.enabled())
+    {
+        build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(_act_info.activation())));
+        build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(_act_info.a()));
+        build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(_act_info.b()));
+    }
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    ICLKernel::configure_internal(win_config.second);
+
+    _config_id = generate_id_for_tuning(kernel_name, *src1, *dst);
+}
+
+void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src_0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src_1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src_0, src_1, dst);
+
+    const TensorShape &in_shape1 = src_0->info()->tensor_shape();
+    const TensorShape &in_shape2 = src_1->info()->tensor_shape();
+    const TensorShape &out_shape = dst->info()->tensor_shape();
+
+    bool       can_collapse = true;
+    const bool is_vector    = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1;
+    if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector)
+    {
+        can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+        for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+        {
+            can_collapse = (in_shape1[d] == in_shape2[d]);
+        }
+    }
+
+    bool   has_collapsed = false;
+    Window collapsed =
+        can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+
+    const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+    const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+    Window slice      = collapsed.first_slice_window_3D();
+    Window slice_src1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+    Window slice_src2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+    // Check whether it is in_place calculation
+    const bool in_place = (src_0 == dst) || (src_1 == dst);
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src_0, slice_src1);
+        add_3D_tensor_argument(idx, src_1, slice_src2);
+        if (!in_place)
+        {
+            add_3D_tensor_argument(idx, dst, slice);
+        }
+
+        enqueue(queue, *this, slice, lws_hint());
+        ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_src1));
+        ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_src2));
+    } while (collapsed.slide_window_slice_3D(slice));
+}
+
+/** Logical binary */
+
+void ClLogicalBinaryKernel::configure(const ClCompileContext &compile_context,
+                                      LogicalOperation        op,
+                                      ITensorInfo            *src1,
+                                      ITensorInfo            *src2,
+                                      ITensorInfo            *dst)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(ClLogicalBinaryKernel::validate(op, src1, src2, dst));
+    _op = op;
+    configure_common(compile_context, src1, src2, dst);
+}
+
+Status ClLogicalBinaryKernel::validate(LogicalOperation   op,
+                                       const ITensorInfo *src1,
+                                       const ITensorInfo *src2,
+                                       const ITensorInfo *dst)
+{
+    ARM_COMPUTE_UNUSED(op);
+    ARM_COMPUTE_ASSERT(op != LogicalOperation::Unknown && op != LogicalOperation::Not);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
+
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*src1, *src2, *dst));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window_for_logical_binary_operators(*src1->clone(), *src2->clone(), *dst->clone())
+            .first);
+
+    return Status{};
+}
+
+std::string ClLogicalBinaryKernel::name()
+{
+    switch (_op)
+    {
+        case LogicalOperation::And:
+            return "AND";
+        case LogicalOperation::Or:
+            return "OR";
+        case LogicalOperation::Not:
+        /* fall through */
+        default:
+            ARM_COMPUTE_ASSERT(true);
+    }
+    return "";
+}
+
+std::pair<Status, Window>
+ClLogicalBinaryKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
+{
+    return validate_and_configure_window_for_logical_binary_operators(src1, src2, dst);
+}
+
+CLBuildOptions
+ClLogicalBinaryKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst)
+{
+    // The arithmetic utility functions can be share
+    return generate_build_options_with_arithmetic_rules(src1, src2, dst, name());
+}
+
+std::string ClLogicalBinaryKernel::generate_id_for_tuning(const std::string &kernel_name,
+                                                          const ITensorInfo &src1,
+                                                          const ITensorInfo &dst)
+{
+    return generate_id_for_tuning_common(kernel_name, src1, dst);
+}
+
+/** Arithmetic operations with saturation*/
+void ClSaturatedArithmeticKernel::configure(const ClCompileContext    &compile_context,
+                                            ArithmeticOperation        op,
+                                            ITensorInfo               *input1,
+                                            ITensorInfo               *input2,
+                                            ITensorInfo               *output,
+                                            const ConvertPolicy       &policy,
+                                            const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_ERROR_THROW_ON(ClSaturatedArithmeticKernel::validate(op, input1, input2, output, policy, act_info));
+    auto padding_info = get_padding_info({input1, input2, output});
+
+    _policy   = policy;
+    _op       = op;
+    _act_info = act_info;
+    configure_common(compile_context, input1, input2, output);
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClSaturatedArithmeticKernel::validate(ArithmeticOperation        op,
+                                             const ITensorInfo         *input1,
+                                             const ITensorInfo         *input2,
+                                             const ITensorInfo         *output,
+                                             const ConvertPolicy       &policy,
+                                             const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_UNUSED(op, policy);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*input1, *input2, *output));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window_for_arithmetic_operators(*input1->clone(), *input2->clone(), *output->clone())
+            .first);
+    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(output->data_type()));
+
+    return Status{};
+}
+
+std::pair<Status, Window> ClSaturatedArithmeticKernel::validate_and_configure_window(ITensorInfo &input1,
+                                                                                     ITensorInfo &input2,
+                                                                                     ITensorInfo &output)
+{
+    return validate_and_configure_window_for_arithmetic_operators(input1, input2, output);
+}
+
+CLBuildOptions ClSaturatedArithmeticKernel::generate_build_options(const ITensorInfo &input1,
+                                                                   const ITensorInfo &input2,
+                                                                   const ITensorInfo &output)
+{
+    const bool has_float_out = is_data_type_float(output.data_type());
+    auto       build_options = generate_build_options_with_arithmetic_rules(input1, input2, output, name());
+    build_options.add_option((_policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE");
+    return build_options;
+}
+
+std::string ClSaturatedArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name,
+                                                                const ITensorInfo &input1,
+                                                                const ITensorInfo &output)
+{
+    auto config_id = generate_id_for_tuning_common(kernel_name, input1, output);
+    config_id += (_policy == ConvertPolicy::WRAP) ? "_wrap_" : "_saturate_";
+    config_id += lower_string(string_from_data_layout(input1.data_layout()));
+    return config_id;
+}
+
+std::string ClSaturatedArithmeticKernel::name()
+{
+    return supported_sat_arithmetic_ops[_op];
+}
+
+/** Arithmetic operations*/
+void ClArithmeticKernel::configure(const ClCompileContext    &compile_context,
+                                   ArithmeticOperation        op,
+                                   ITensorInfo               *src1,
+                                   ITensorInfo               *src2,
+                                   ITensorInfo               *dst,
+                                   const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(ClArithmeticKernel::validate(op, src1, src2, dst, act_info));
+    auto padding_info = get_padding_info({src1, src2, dst});
+
+    _op       = op;
+    _act_info = act_info;
+    configure_common(compile_context, src1, src2, dst);
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClArithmeticKernel::validate(ArithmeticOperation        op,
+                                    const ITensorInfo         *src1,
+                                    const ITensorInfo         *src2,
+                                    const ITensorInfo         *dst,
+                                    const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst);
+    if (op == ArithmeticOperation::DIV)
+    {
+        // Partial integer support S32/F32/F16
+        ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_divide_operation(src1, src2, dst));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first);
+    }
+    else if (op == ArithmeticOperation::POWER)
+    {
+        // Power operators doesn't support integer arithmetic
+        ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_float_only_supported_rules(*src1, *src2, *dst));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first);
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*src1, *src2, *dst));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            validate_and_configure_window_for_arithmetic_operators(*src1->clone(), *src2->clone(), *dst->clone())
+                .first);
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type()));
+
+    return Status{};
+}
+std::pair<Status, Window>
+ClArithmeticKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
+{
+    if (_op == ArithmeticOperation::DIV || _op == ArithmeticOperation::POWER)
+    {
+        // Division and Power operators don't support integer arithmetic
+        return validate_and_configure_window_for_division(src1, src2, dst);
+    }
+    else
+    {
+        return validate_and_configure_window_for_arithmetic_operators(src1, src2, dst);
+    }
+}
+
+CLBuildOptions
+ClArithmeticKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst)
+{
+    return generate_build_options_with_arithmetic_rules(src1, src2, dst, name());
+}
+std::string ClArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name,
+                                                       const ITensorInfo &src1,
+                                                       const ITensorInfo &dst)
+{
+    return generate_id_for_tuning_common(kernel_name, src1, dst);
+}
+
+std::string ClArithmeticKernel::name()
+{
+    return supported_arithmetic_ops[_op];
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClElementwiseKernel.h b/src/gpu/cl/kernels/ClElementwiseKernel.h
new file mode 100644
index 0000000000..73e54542b2
--- /dev/null
+++ b/src/gpu/cl/kernels/ClElementwiseKernel.h
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_ELEMENTWISE_KERNEL_H
+#define ARM_COMPUTE_CL_ELEMENTWISE_KERNEL_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/common/Macros.h"
+#include "src/core/KernelTypes.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for an element-wise operation kernel
+ *
+ * Element-wise operation is computed by:
+ * @f[ dst(x,y) = OP(src1(x,y), src2(x,y))@f]
+ *
+ * For binary elementwise ops in-place cannot be enabled by passing nullptr to dst, it can only be enabled by passing either src1 or src2 to dst instead.
+ *
+ */
+class ClElementwiseKernel : public IClKernel
+{
+public:
+    ClElementwiseKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClElementwiseKernel);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+
+protected:
+    /** The name of the operation */
+    virtual std::string name() = 0;
+
+    /** Configure kernel for a given list of arguments
+     *
+     * @param[in] src1 First source tensor info. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32.
+     * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
+     * @param[in] dst  Destination tensor info. Data types supported: same as @p src1.
+     *
+     * @return a pair of Status and Window
+     */
+    virtual std::pair<Status, Window>
+    validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) = 0;
+
+    /** Generate the build options for the specific kernel
+     *
+     * @reutrn a CLBuildOptions struct
+     */
+    virtual CLBuildOptions
+    generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) = 0;
+
+    /** Generate the identifier for tuning
+     *
+     * @reutrn a string
+     */
+    virtual std::string
+    generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) = 0;
+
+    /** Commmon configure function for element-wise operators with no additional options (e.g., Div, Min, Max, SquaredDiff)
+     *
+     */
+    void
+    configure_common(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst);
+
+    ActivationLayerInfo _act_info{};
+};
+
+class ClLogicalBinaryKernel : public ClElementwiseKernel
+{
+public:
+    ClLogicalBinaryKernel() = default;
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClLogicalBinaryKernel);
+    /** Function to configure kernel
+     *
+     * @param[in] compile_context The compile context to be used.
+     * @param[in] op              Logical binary operation to be executed.
+     * @param[in] src1            First source tensor info. Data types supported: U8.
+     * @param[in] src2            Second source tensor info. Data types supported: same as @p src1.
+     * @param[in] dst             Destination tensor info. Data types supported: same as @p src1.
+     */
+    void configure(const ClCompileContext &compile_context,
+                   LogicalOperation        op,
+                   ITensorInfo            *src1,
+                   ITensorInfo            *src2,
+                   ITensorInfo            *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClLogicalBinaryKernel::configure()
+     *
+     * @return a status
+     */
+    static Status
+    validate(LogicalOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst);
+
+private:
+    // Inherited methods overridden:
+    std::string name() override;
+    std::pair<Status, Window>
+    validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override;
+    CLBuildOptions
+    generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override;
+    std::string
+    generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override;
+
+    LogicalOperation _op{LogicalOperation::Unknown};
+};
+
+/** Addition operation */
+class ClSaturatedArithmeticKernel : public ClElementwiseKernel
+{
+public:
+    ClSaturatedArithmeticKernel() = default;
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClSaturatedArithmeticKernel);
+    /** Static function to check if given info will lead to a valid configuration of @ref ClSaturatedArithmeticKernel
+     *
+     * @param[in] compile_context The compile context to be used.
+     * @param[in] op              Arithmetic operation to be executed.
+     * @param[in] input1          First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
+     * @param[in] input2          Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output          Output tensor info. Data types supported: Same as @p input1.
+     * @param[in] policy          Policy to use to handle overflow.
+     * @param[in] act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const ClCompileContext    &compile_context,
+                   ArithmeticOperation        op,
+                   ITensorInfo               *input1,
+                   ITensorInfo               *input2,
+                   ITensorInfo               *output,
+                   const ConvertPolicy       &policy,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClSaturatedArithmeticKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(ArithmeticOperation        op,
+                           const ITensorInfo         *input1,
+                           const ITensorInfo         *input2,
+                           const ITensorInfo         *output,
+                           const ConvertPolicy       &policy,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+protected:
+    // Inherited methods overridden:
+    std::string name() override;
+    std::pair<Status, Window>
+    validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) override;
+    CLBuildOptions
+    generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override;
+    std::string generate_id_for_tuning(const std::string &kernel_name,
+                                       const ITensorInfo &input1,
+                                       const ITensorInfo &output) override;
+
+private:
+    ConvertPolicy       _policy{};
+    ArithmeticOperation _op{};
+};
+
+class ClArithmeticKernel : public ClElementwiseKernel
+{
+public:
+    ClArithmeticKernel() = default;
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClArithmeticKernel);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref ClArithmeticKernel
+     *
+     * @param[in] compile_context The compile context to be used.
+     * @param[in] op              Arithmetic operation to be executed.
+     * @param[in] src1            First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
+     * @param[in] src2            Second source tensor info. Data types supported: same as @p src1.
+     * @param[in] dst             Destination tensor info. Data types supported: same as @p src1.
+     * @param[in] act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const ClCompileContext    &compile_context,
+                   ArithmeticOperation        op,
+                   ITensorInfo               *src1,
+                   ITensorInfo               *src2,
+                   ITensorInfo               *dst,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClArithmeticKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(ArithmeticOperation        op,
+                           const ITensorInfo         *src1,
+                           const ITensorInfo         *src2,
+                           const ITensorInfo         *dst,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+protected:
+    // Inherited methods overridden:
+    std::string name() override;
+    std::pair<Status, Window>
+    validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override;
+    CLBuildOptions
+    generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override;
+    std::string
+    generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override;
+
+private:
+    ArithmeticOperation _op{};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_ELEMENTWISE_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp
new file mode 100644
index 0000000000..f7c198ee54
--- /dev/null
+++ b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+constexpr unsigned int vector_size_byte_opencl = 16;
+
+Status validate_arguments(const ITensorInfo &src, const ITensorInfo &dst, const ElementWiseUnary op)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src);
+    if (op == ElementWiseUnary::LOGICAL_NOT)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::U8);
+    }
+    else if (op == ElementWiseUnary::NEG)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32);
+    }
+    else if (op == ElementWiseUnary::RSQRT) // Allow quantized types for only RSQRT.
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::QASYMM8,
+                                                             DataType::QASYMM8_SIGNED);
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32);
+    }
+
+    // Validate in case of configured dst
+    if (dst.total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst);
+    }
+
+    return Status{};
+}
+} // namespace
+
+ClElementWiseUnaryKernel::ClElementWiseUnaryKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context,
+                                         const ITensorInfo      *src,
+                                         ITensorInfo            *dst,
+                                         const ElementWiseUnary &op)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    auto padding_info = get_padding_info({src, dst});
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src, *dst, op));
+    const unsigned int num_elems_processed_per_iteration =
+        adjust_vec_size(vector_size_byte_opencl / dst->element_size(), dst->dimension(0));
+
+    std::string kernel_name = "elementwise_unary";
+    const int   vec_size_x  = num_elems_processed_per_iteration;
+    const int   dst_width_x = dst->dimension(0);
+    if (is_data_type_quantized(src->data_type()))
+    {
+        kernel_name += "_quantized";
+    }
+    // Set kernel build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+    build_opts.add_option("-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(dst_width_x - vec_size_x, 0)));
+    if (is_data_type_quantized(src->data_type()))
+    {
+        const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
+        const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
+        build_opts.add_option("-DOFFSET_IN=" + support::cpp11::to_string(iqinfo.offset));
+        build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(oqinfo.offset));
+        build_opts.add_option("-DSCALE_IN=" + float_to_string_with_full_precision(iqinfo.scale));
+        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oqinfo.scale));
+    }
+    switch (op)
+    {
+        case ElementWiseUnary::RSQRT:
+            build_opts.add_option("-DOPERATION=rsqrt_op");
+            break;
+        case ElementWiseUnary::EXP:
+            build_opts.add_option("-DOPERATION=exp_op");
+            break;
+        case ElementWiseUnary::NEG:
+            build_opts.add_option("-DOPERATION=neg_op");
+            break;
+        case ElementWiseUnary::SIN:
+            build_opts.add_option("-DOPERATION=sin_op");
+            break;
+        case ElementWiseUnary::ABS:
+            build_opts.add_option("-DOPERATION=fabs_op");
+            break;
+        case ElementWiseUnary::LOG:
+            build_opts.add_option("-DOPERATION=natural_log_op");
+            break;
+        case ElementWiseUnary::ROUND:
+            build_opts.add_option("-DOPERATION=round_op");
+            break;
+        case ElementWiseUnary::LOGICAL_NOT:
+            build_opts.add_option("-DOPERATION=logical_not_op");
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Not implemented");
+    }
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*dst);
+    win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+
+    ICLKernel::configure_internal(win);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClElementWiseUnaryKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ElementWiseUnary &op)
+{
+    ARM_COMPUTE_UNUSED(op);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src, *dst, op));
+
+    return Status{};
+}
+
+void ClElementWiseUnaryKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src, slice);
+        add_3D_tensor_argument(idx, dst, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    } while (collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.h b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.h
new file mode 100644
index 0000000000..81721f8ca8
--- /dev/null
+++ b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_ELEMENTWISE_UNARY_KERNEL_H
+#define ARM_COMPUTE_CL_ELEMENTWISE_UNARY_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the elementwise unary operator */
+class ClElementWiseUnaryKernel : public IClKernel
+{
+public:
+    ClElementWiseUnaryKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClElementWiseUnaryKernel);
+    /** Initialise the kernel's srcs, dst.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             First source tensor info. Data types supported: F16/F32.
+     * @param[out] dst             Destination tensor info. Data types supported: same as @p src.
+     * @param[in]  op              Element wise unary operation to perform.
+     */
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo      *src,
+                   ITensorInfo            *dst,
+                   const ElementWiseUnary &op);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClElementWiseUnaryKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ElementWiseUnary &op);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_ELEMENTWISE_UNARY_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClFillKernel.cpp b/src/gpu/cl/kernels/ClFillKernel.cpp
new file mode 100644
index 0000000000..96ad503730
--- /dev/null
+++ b/src/gpu/cl/kernels/ClFillKernel.cpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClFillKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+ClFillKernel::ClFillKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClFillKernel::configure(const CLCompileContext &compile_context,
+                             ITensorInfo            *tensor,
+                             const PixelValue       &constant_value,
+                             Window                 *window)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
+    ARM_COMPUTE_ERROR_THROW_ON(validate(tensor, constant_value, window));
+
+    const DataType data_type  = tensor->data_type();
+    const int      vec_size_x = 16 / tensor->element_size();
+
+    // Create and update the window (if needed)
+    _full_window = calculate_max_window(*tensor);
+    Window win   = _full_window;
+    if (window != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *window);
+        win = *window;
+    }
+
+    const int  output_width_x = win.num_iterations(0);
+    const bool multi_access_x = output_width_x >= vec_size_x;
+    const bool remainder_x    = output_width_x % vec_size_x > 0;
+
+    if (multi_access_x)
+    {
+        win.set(Window::DimX,
+                Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+    }
+    ICLKernel::configure_internal(win);
+
+    // Create kernel
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(constant_value, data_type));
+    build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+    build_opts.add_option_if(multi_access_x && remainder_x,
+                             "-DLAST_ACCESSED_X=" +
+                                 support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+    _kernel = create_kernel(compile_context, "memset", build_opts.options());
+}
+
+Status ClFillKernel::validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *window)
+{
+    ARM_COMPUTE_UNUSED(tensor);
+    ARM_COMPUTE_UNUSED(constant_value);
+    if (window != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(window->x().step() != 1);
+    }
+    return Status{};
+}
+
+void ClFillKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto tensor =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+
+    // Collapse all the batches on the third
+    Window collapsed = window.collapse_if_possible(_full_window, Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, tensor, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    } while (collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClFillKernel.h b/src/gpu/cl/kernels/ClFillKernel.h
new file mode 100644
index 0000000000..5d69fbfbd1
--- /dev/null
+++ b/src/gpu/cl/kernels/ClFillKernel.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_FILL_KERNEL_H
+#define ARM_COMPUTE_CL_FILL_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for filling the planes of a tensor */
+class ClFillKernel : public IClKernel
+{
+public:
+    ClFillKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClFillKernel);
+    /** Initialise the kernel's tensor and filling value
+     *
+     * @param[in]     compile_context The compile context to be used.
+     * @param[in,out] tensor          Input tensor info. Supported data types: All.
+     * @param[in]     constant_value  The value used to fill the planes of the tensor
+     * @param[in]     window          Window to be used in case setting only part of a tensor. Default is nullptr.
+     */
+    void configure(const CLCompileContext &compile_context,
+                   ITensorInfo            *tensor,
+                   const PixelValue       &constant_value,
+                   Window                 *window = nullptr);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClFillKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    Window _full_window{};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_FILL_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClFloorKernel.cpp b/src/gpu/cl/kernels/ClFloorKernel.cpp
new file mode 100644
index 0000000000..358e84012b
--- /dev/null
+++ b/src/gpu/cl/kernels/ClFloorKernel.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClFloorKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+
+    // Validate in case of configured output
+    if (dst->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+    }
+
+    return Status{};
+}
+} // namespace
+
+ClFloorKernel::ClFloorKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClFloorKernel::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    // Auto initialize output
+    auto_init_if_empty(*dst, src->tensor_shape(), 1, src->data_type());
+
+    // Validate
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
+    auto padding_info = get_padding_info({src, dst});
+
+    const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0));
+    const int          vec_size_x_leftovers = src->dimension(0) % vec_size_x;
+    CLBuildOptions     build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftovers));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, "floor_layer", build_opts.options());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*src, Steps(vec_size_x));
+    IClKernel::configure_internal(win);
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClFloorKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
+    return Status{};
+}
+
+void ClFloorKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IClKernel::window(), window);
+
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    Window collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src, slice);
+        add_3D_tensor_argument(idx, dst, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    } while (collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClFloorKernel.h b/src/gpu/cl/kernels/ClFloorKernel.h
new file mode 100644
index 0000000000..6e413340ba
--- /dev/null
+++ b/src/gpu/cl/kernels/ClFloorKernel.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_FLOOR_KERNEL_H
+#define ARM_COMPUTE_CL_FLOOR_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to perform a floor operation */
+class ClFloorKernel : public IClKernel
+{
+public:
+    ClFloorKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClFloorKernel);
+    /** Configure kernel for a given list of arguments
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data type supported: F16/F32.
+     * @param[out] dst             Destination tensor info. Same as @p src
+     */
+    void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClFloorKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_FLOOR_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp
new file mode 100644
index 0000000000..e0d925dfb2
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2019-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+using ElementsProcessed = Steps;
+
+Status validate_arguments(const ITensorInfo       *src0,
+                          const ITensorInfo       *src1,
+                          const ITensorInfo       *dst,
+                          const GEMMLHSMatrixInfo &lhs_info,
+                          const GEMMRHSMatrixInfo &rhs_info,
+                          const GEMMReshapeInfo   &gemm_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+    if (src0->data_type() == DataType::QASYMM8)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8,
+                                                             DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+                                    "The number of dimensions for the LHS matrix must be <= 4");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+                                    "The number of dimensions for the RHS matrix must be <= 3");
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3),
+                                    "Only 2,3,4,8,16 are supported for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3),
+                                    "Only 2,3,4,8,16 are supported for n0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
+
+    const int m = gemm_info.m();
+    const int n = gemm_info.n();
+    const int k = gemm_info.k();
+
+    ARM_COMPUTE_UNUSED(m);
+    ARM_COMPUTE_UNUSED(n);
+    ARM_COMPUTE_UNUSED(k);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast<unsigned int>(k));
+    ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) != static_cast<unsigned int>(n));
+    ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != static_cast<unsigned int>(k));
+    if (gemm_info.reinterpret_input_as_3d())
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast<unsigned int>(m));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != static_cast<unsigned int>(m));
+    }
+
+    if (dst->total_size() != 0)
+    {
+        const TensorInfo tensor_info_dst =
+            dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo       *src0,
+                                                        ITensorInfo             *src1,
+                                                        ITensorInfo             *dst,
+                                                        const GEMMLHSMatrixInfo &lhs_info,
+                                                        const GEMMRHSMatrixInfo &rhs_info,
+                                                        const GEMMReshapeInfo   &gemm_info,
+                                                        ElementsProcessed       &num_elements_processed)
+{
+    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+    bool          reinterpret_input_as_3d             = gemm_info.reinterpret_input_as_3d();
+    bool          reinterpret_dst_as_3d               = (gemm_info.depth_output_gemm3d() != 0);
+
+    Window win{};
+    bool   window_changed = false;
+
+    // In case both input and dst have to be reinterpreted as 3D tensors,
+    // force reinterpret_dst_as_3d to be false.
+    if (reinterpret_input_as_3d == reinterpret_dst_as_3d)
+    {
+        reinterpret_dst_as_3d = false;
+    }
+
+    // dst tensor auto initialization if not yet initialized
+    auto_init_if_empty(*dst, src0->clone()
+                                 ->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))
+                                 .set_data_type(DataType::S32));
+
+    TensorInfo tmp_info(*dst);
+
+    if (reinterpret_dst_as_3d)
+    {
+        // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+        // the window needs to be constructed on the 2D collapsed version of the tensor
+        TensorShape tmp_shape(dst->tensor_shape());
+        tmp_shape.collapse(2U, 1U);
+        tmp_info.set_tensor_shape(tmp_shape);
+    }
+
+    // Configure kernel window
+    num_elems_processed_per_iteration_x = rhs_info.n0;
+    num_elems_processed_per_iteration_y = lhs_info.m0;
+
+    win =
+        calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+    // RHS matrix still needs padding on the X
+    AccessWindowStatic src1_access(
+        src1, 0, 0, ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), src1->dimension(1));
+
+    window_changed = update_window_and_padding(win, src1_access); // window used by the execute_window_loop
+
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    Window             collapsed             = win;
+    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
+    collapsed                                = win.collapse(win, dimension_to_collapse);
+
+    Status err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, collapsed);
+}
+} // namespace
+
+ClGemmLowpMatrixMultiplyNativeKernel::ClGemmLowpMatrixMultiplyNativeKernel()
+{
+    _type = CLKernelType::GEMM;
+}
+
+void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext  &compile_context,
+                                                     const ITensorInfo       *src0,
+                                                     ITensorInfo             *src1,
+                                                     ITensorInfo             *dst,
+                                                     const GEMMLHSMatrixInfo &lhs_info,
+                                                     const GEMMRHSMatrixInfo &rhs_info,
+                                                     const GEMMReshapeInfo   &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info));
+
+    _reinterpret_input_as_3d  = gemm_info.reinterpret_input_as_3d();
+    _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
+    _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
+
+    // We still need padding on the X dimension for the RHS matrix
+    auto padding_info = get_padding_info({src0, dst});
+
+    // In case both input and dst have to be reinterpreted as 3D tensors,
+    // force reinterpret_input_as_3d and reinterpret_dst_as_3d to be false.
+    if (_reinterpret_input_as_3d == _reinterpret_output_as_3d)
+    {
+        _reinterpret_input_as_3d  = false;
+        _reinterpret_output_as_3d = false;
+    }
+
+    // Check if we need to slide the matrix B
+    const unsigned int num_dimensions_src0 = src0->num_dimensions();
+    _slide_matrix_b                        = (src1->num_dimensions() >= num_dimensions_src0);
+
+    ElementsProcessed num_elements_processed{};
+
+    // Configure kernel window
+    auto win_config =
+        validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
+    // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true,
+    // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
+    // This means that the actual m used by the kernel is given by dst->info()->dimension(1) and not by gemm_info.m
+    const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : dst->dimension(1);
+    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
+    const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
+    const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0;
+
+    // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
+    // NOTE: This might have implications on heuristics and performance
+    const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
+
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
+    build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+                             "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
+    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+                             "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
+    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
+    build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+    build_opts.add_option("-DM=" + support::cpp11::to_string(src0->dimension(1)));
+    build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
+    build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k()));
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+    build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
+    build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(src0->data_type()));
+    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
+    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
+    std::string kernel_name("gemmlowp_mm_native");
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : "";
+    _config_id += "_";
+    _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
+    _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
+    _config_id += support::cpp11::to_string(dst->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(gemm_info.k());
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.m0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.n0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.k0);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo       *src0,
+                                                      const ITensorInfo       *src1,
+                                                      const ITensorInfo       *dst,
+                                                      const GEMMLHSMatrixInfo &lhs_info,
+                                                      const GEMMRHSMatrixInfo &rhs_info,
+                                                      const GEMMReshapeInfo   &gemm_info)
+{
+    ElementsProcessed num_elements_processed{};
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(),
+                                                              dst->clone().get(), lhs_info, rhs_info, gemm_info,
+                                                              num_elements_processed)
+                                    .first);
+
+    return Status{};
+}
+
+void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    if (src1->info()->num_dimensions() < 3)
+    {
+        // The stride_z for matrix B must be zero if we do not slice
+        ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
+    }
+
+    Window slice          = window.first_slice_window_3D();
+    Window slice_matrix_b = slice;
+
+    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+    if (_reinterpret_input_as_3d)
+    {
+        // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
+        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3;
+        const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom;
+        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+    }
+
+    if (_reinterpret_output_as_3d)
+    {
+        // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
+        const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
+        const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
+        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+    }
+
+    do
+    {
+        Window slice_b = slice;
+        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        if (!_slide_matrix_b)
+        {
+            slice_b = slice_matrix_b;
+        }
+
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, src0, slice);
+        add_2D_tensor_argument(idx, src1, slice_b);
+        add_2D_tensor_argument(idx, dst, slice);
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
+        enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
+    } while (window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h
new file mode 100644
index 0000000000..4f87096158
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to multiply matrices with QASYMM8/QASYMM8_SIGNED data type */
+class ClGemmLowpMatrixMultiplyNativeKernel : public IClKernel
+{
+public:
+    ClGemmLowpMatrixMultiplyNativeKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpMatrixMultiplyNativeKernel);
+    /** Initialise the kernel's input and dst.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src0            Source tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  src1            Source tensor containing the RHS matrix. Data type supported: same as @p src0
+     * @param[out] dst             Destination tensor to store the result of matrix multiplication. Data type supported: S32
+     * @param[in]  lhs_info        LHS matrix information used to retrieve the number of rows to be processed by each thread
+     *                             lhs_info.m0: 2,3,4,5,6,7,8
+     *                             lhs_info.k0: 2,3,4,8,16
+     * @param[in]  rhs_info        RHS matrix information used to retrieve the number of columns to be processed by each thread
+     *                             rhs_info.n0: 2,3,4,8,16
+     *                             rhs_info.k0: same as lhs_info.k0
+     * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
+     */
+    void configure(const CLCompileContext  &compile_context,
+                   const ITensorInfo       *src0,
+                   ITensorInfo             *src1,
+                   ITensorInfo             *dst,
+                   const GEMMLHSMatrixInfo &lhs_info,
+                   const GEMMRHSMatrixInfo &rhs_info,
+                   const GEMMReshapeInfo   &gemm_info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmLowpMatrixMultiplyNativeKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo       *src0,
+                           const ITensorInfo       *src1,
+                           const ITensorInfo       *dst,
+                           const GEMMLHSMatrixInfo &lhs_info,
+                           const GEMMRHSMatrixInfo &rhs_info,
+                           const GEMMReshapeInfo   &gemm_info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    bool _slide_matrix_b{true};
+    bool _reinterpret_input_as_3d{false};
+    bool _reinterpret_output_as_3d{false};
+    bool _use_dummy_work_items{false};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp
new file mode 100644
index 0000000000..ddbc809cdd
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2019-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+using namespace misc::shape_calculator;
+
+namespace
+{
+using ElementsProcessed = Steps;
+
+Status validate_arguments(const ITensorInfo       *src0,
+                          const ITensorInfo       *src1,
+                          const ITensorInfo       *dst,
+                          const GEMMLHSMatrixInfo &lhs_info,
+                          const GEMMRHSMatrixInfo &rhs_info,
+                          const GEMMReshapeInfo   &gemm_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+                                    "The number of dimensions for the LHS matrix must be <= 4");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+                                    "The number of dimensions for the RHS matrix must be <= 3");
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose);
+    ARM_COMPUTE_RETURN_ERROR_ON(!rhs_info.transpose);
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3),
+                                    "Only 2,3,4,8,16 are supported for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3),
+                                    "Only 2,3,4,8,16 are supported for n0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
+
+    const int m = gemm_info.m();
+    const int n = gemm_info.n();
+    const int k = gemm_info.k();
+
+    TensorShape tensor_shape0{src0->tensor_shape()};
+    tensor_shape0.set(0, k);
+    tensor_shape0.set(1, m);
+
+    TensorShape tensor_shape1{src1->tensor_shape()};
+    tensor_shape1.set(0, n);
+    tensor_shape1.set(1, k);
+
+    const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0);
+    const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
+
+    const TensorInfo tensor_info_reshaped0 =
+        src0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
+    const TensorInfo tensor_info_reshaped1 =
+        src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
+
+    if (dst->total_size() != 0)
+    {
+        const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo       *src0,
+                                                        const ITensorInfo       *src1,
+                                                        ITensorInfo             *dst,
+                                                        const GEMMLHSMatrixInfo &lhs_info,
+                                                        const GEMMRHSMatrixInfo &rhs_info,
+                                                        const GEMMReshapeInfo   &gemm_info,
+                                                        ElementsProcessed       &num_elements_processed)
+{
+    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+    bool          reinterpret_output_as_3d            = (gemm_info.depth_output_gemm3d() != 0);
+
+    // dst tensor auto initialization if not yet initialized
+    auto_init_if_empty(
+        *dst, src0->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32));
+
+    TensorInfo tmp_info(*dst);
+    if (reinterpret_output_as_3d)
+    {
+        // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+        // the window needs to be constructed on the 2D collapsed version of the tensor
+        TensorShape tmp_shape(dst->tensor_shape());
+        tmp_shape.collapse(2U, 1U);
+        tmp_info.set_tensor_shape(tmp_shape);
+    }
+
+    // Configure kernel window
+    num_elems_processed_per_iteration_x = rhs_info.n0;
+    num_elems_processed_per_iteration_y = lhs_info.m0;
+    Window win =
+        calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    Window             collapsed             = win;
+    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
+    collapsed                                = win.collapse(win, dimension_to_collapse);
+
+    return std::make_pair(Status{}, collapsed);
+}
+} // namespace
+
+ClGemmLowpMatrixMultiplyReshapedKernel::ClGemmLowpMatrixMultiplyReshapedKernel()
+{
+    _type = CLKernelType::GEMM;
+}
+
+void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext  &compile_context,
+                                                       const ITensorInfo       *src0,
+                                                       const ITensorInfo       *src1,
+                                                       ITensorInfo             *dst,
+                                                       const GEMMLHSMatrixInfo &lhs_info,
+                                                       const GEMMRHSMatrixInfo &rhs_info,
+                                                       const GEMMReshapeInfo   &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info));
+
+    _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
+    _k                        = gemm_info.k();
+    _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
+
+    // Check if we need to slide the matrix B
+    const unsigned int num_dimensionssrc0 = src0->num_dimensions();
+    _slide_matrix_b                       = (src1->num_dimensions() >= num_dimensionssrc0);
+
+    auto              padding_info = get_padding_info({src0, src1, dst});
+    ElementsProcessed num_elements_processed{};
+
+    // Configure kernel window
+    auto win_config =
+        validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
+    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
+    const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : dst->dimension(1);
+
+    const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
+    const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0;
+
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+    build_opts.add_option_if(_reinterpret_output_as_3d,
+                             "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
+    build_opts.add_option_if(_reinterpret_output_as_3d,
+                             "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
+    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
+    build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
+    build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
+    build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+    build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m()));
+    build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+    build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
+    build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
+    build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
+    build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(src0->data_type()));
+    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
+    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
+
+    std::string kernel_name("gemmlowp_mm_reshaped_");
+    kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_";
+    kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt";
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : "";
+    _config_id += "_";
+    _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
+    _config_id += support::cpp11::to_string(dst->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(gemm_info.k());
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.m0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.n0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.k0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.v0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.h0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.interleave);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.interleave);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo       *src0,
+                                                        const ITensorInfo       *src1,
+                                                        const ITensorInfo       *dst,
+                                                        const GEMMLHSMatrixInfo &lhs_info,
+                                                        const GEMMRHSMatrixInfo &rhs_info,
+                                                        const GEMMReshapeInfo   &gemm_info)
+{
+    ElementsProcessed num_elements_processed{};
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(),
+                                                              dst->clone().get(), lhs_info, rhs_info, gemm_info,
+                                                              num_elements_processed)
+                                    .first);
+
+    return Status{};
+}
+
+void ClGemmLowpMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    if (src1->info()->num_dimensions() < 3)
+    {
+        // The stride_z for matrix B must be zero if we do not slice
+        ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
+    }
+
+    Window slice          = window.first_slice_window_3D();
+    Window slice_matrix_b = slice;
+
+    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+    if (_reinterpret_output_as_3d)
+    {
+        // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor
+        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 4;
+        const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
+        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+    }
+
+    do
+    {
+        Window slice_b = slice;
+        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        if (!_slide_matrix_b)
+        {
+            slice_b = slice_matrix_b;
+        }
+
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, src0, slice);
+        add_2D_tensor_argument(idx, src1, slice_b);
+        add_2D_tensor_argument(idx, dst, slice);
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_k));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
+        enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
+    } while (window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h
new file mode 100644
index 0000000000..d7b785996f
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to multiply matrices when both the input matrices LHS (src0) and RHS (src1) have been reshaped
+ *
+ * @note The input matrices @p src0 and @p src1 must be reshaped through:
+ *  - @ref opencl::kernels::ClGemmReshapeLhsMatrixKernel
+ *  - @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel
+ */
+class ClGemmLowpMatrixMultiplyReshapedKernel : public IClKernel
+{
+public:
+    ClGemmLowpMatrixMultiplyReshapedKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpMatrixMultiplyReshapedKernel);
+    /** Initialise the kernel's input and dst.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src0            Source tensor containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less or equal than 4.
+     * @param[in]  src1            Source tensor containing the RHS reshaped matrix. Data type supported: same as @p src0. The number of dimensions for the RHS matrix must be less or equal than 3.
+     * @param[out] dst             Destination tensor to store the result of matrix multiplication. Data type supported: S32
+     * @param[in]  lhs_info        LHS matrix information used for reshaping the src0 tensor.  Only the following values are supported:
+     *                             lhs_info.m0: 2,3,4,5,6,7,8
+     *                             lhs_info.k0: 2,3,4,8,16
+     *                             lhs_info.transpose: false
+     * @param[in]  rhs_info        RHS matrix information used for reshaping the src1 tensor.  Only the following values are supported:
+     *                             rhs_info.n0: 2,3,4,8,16
+     *                             rhs_info.k0: same as lhs_info.k0
+     *                             rhs_info.transpose: true
+     * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
+     *
+     * @note lhs_info.k0 must be equal to rhs_info.k0
+     */
+    void configure(const CLCompileContext  &compile_context,
+                   const ITensorInfo       *src0,
+                   const ITensorInfo       *src1,
+                   ITensorInfo             *dst,
+                   const GEMMLHSMatrixInfo &lhs_info,
+                   const GEMMRHSMatrixInfo &rhs_info,
+                   const GEMMReshapeInfo   &gemm_info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmLowpMatrixMultiplyReshapedKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo       *src0,
+                           const ITensorInfo       *src1,
+                           const ITensorInfo       *dst,
+                           const GEMMLHSMatrixInfo &lhs_info,
+                           const GEMMRHSMatrixInfo &rhs_info,
+                           const GEMMReshapeInfo   &gemm_info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    bool         _slide_matrix_b{true};
+    bool         _reinterpret_output_as_3d{false};
+    unsigned int _k{1};
+    bool         _use_dummy_work_items{false};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp
new file mode 100644
index 0000000000..2f1f3b8df0
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp
@@ -0,0 +1,609 @@
+/*
+ * Copyright (c) 2019-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+#include <tuple>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+using namespace misc::shape_calculator;
+
+namespace
+{
+using ElementsProcessed = Steps;
+
+Status validate_arguments(const ITensorInfo    *src0,
+                          const ITensorInfo    *src1,
+                          const ITensorInfo    *dst,
+                          const GEMMKernelInfo &gemm_info,
+                          const ITensorInfo    *vector_sum_col,
+                          const ITensorInfo    *vector_sum_row,
+                          const ITensorInfo    *bias,
+                          const ITensorInfo    *output_multipliers,
+                          const ITensorInfo    *output_shifts)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+    if (src0->data_type() == DataType::QASYMM8)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8,
+                                                             DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+                                    "The number of dimensions for the LHS matrix must be <= 4");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+                                    "The number of dimensions for the RHS matrix must be <= 3");
+
+    const GEMMRHSMatrixInfo       rhs_info     = gemm_info.rhs_info;
+    const GEMMLHSMatrixInfo       lhs_info     = gemm_info.lhs_info;
+    const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)),
+                                    "Only 2,3,4,8,16 are supported for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16),
+                                    "Only 2,3,4,8,16 are supported for n0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
+
+    const int m = gemm_info.m;
+    const int n = gemm_info.n;
+    const int k = gemm_info.k;
+
+    TensorShape tensor_shape1{src1->tensor_shape()};
+    tensor_shape1.set(0, n);
+    tensor_shape1.set(1, k);
+
+    const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
+    const TensorInfo tensor_info_reshaped1 =
+        src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+
+    ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast<unsigned int>(k));
+    if (gemm_info.reinterpret_input_as_3d)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast<unsigned int>(m));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != static_cast<unsigned int>(m));
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
+
+    const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info);
+    if (dst->total_size() != 0)
+    {
+        const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_dst_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
+        if (output_stage.type == GEMMLowpOutputStageType::NONE)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
+        }
+    }
+
+    if (bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != bias->dimension(0));
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) ||
+                                        (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT),
+                                    "Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported");
+
+    // Checks performed if the dst stage needs to be fused
+    if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+    {
+        // If a_offset == 0, vector_sum_col can be a nullptr
+        if (gemm_info.a_offset != 0)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+            ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_dst_shape[0]);
+        }
+
+        // If b_offset == 0, vector_sum_row can be a nullptr
+        if (gemm_info.b_offset != 0)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+
+            // Check if mm result is a 3D reinterpretation
+            const bool reinterpret_as_3d =
+                expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x();
+
+            // Validate input
+            ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) !=
+                                                                 (expected_dst_shape[1] * expected_dst_shape[2]));
+            ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_dst_shape[1]);
+
+            if (expected_dst_shape.num_dimensions() > 1)
+            {
+                const unsigned int dst_batch_idx = reinterpret_as_3d ? 3 : 2;
+
+                TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
+                vector_sum_row_shape.collapse_from(1);
+                TensorShape collapsed_dst_shape(expected_dst_shape);
+                collapsed_dst_shape.collapse_from(dst_batch_idx);
+
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_dst_shape[dst_batch_idx],
+                                                "vector_sum_row must have the same number of batches of dst tensor");
+
+                if (gemm_info.a_offset != 0)
+                {
+                    TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
+                    vector_sum_col_shape.collapse_from(1);
+
+                    ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 &&
+                                                        vector_sum_col_shape[1] != vector_sum_row_shape[1],
+                                                    "vector_sum_col tensor must have the same number of batches of "
+                                                    "vector_sum_row_shape or the number of batches must be set to 1");
+                }
+            }
+        }
+
+        if (dst->total_size() != 0)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type());
+        }
+        ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
+
+        if (output_multipliers != nullptr && output_shifts != nullptr)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
+            ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
+            ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
+            if (output_stage.is_quantized_per_channel)
+            {
+                ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_shifts->dimension(0));
+                ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_multipliers->dimension(0));
+            }
+        }
+    }
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo    *src0,
+                                                        const ITensorInfo    *src1,
+                                                        ITensorInfo          *dst,
+                                                        const GEMMKernelInfo &gemm_info,
+                                                        ITensorInfo          *vector_sum_col,
+                                                        const ITensorInfo    *vector_sum_row,
+                                                        ITensorInfo          *bias,
+                                                        ITensorInfo          *output_multipliers,
+                                                        ITensorInfo          *output_shifts,
+                                                        ElementsProcessed    &num_elements_processed)
+{
+    const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
+
+    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+    bool          reinterpret_input_as_3d             = gemm_info.reinterpret_input_as_3d;
+    bool          reinterpret_output_as_3d            = (gemm_info.depth_output_gemm3d != 0);
+
+    Window win{};
+    Window win_out{};
+    bool   window_changed = false;
+
+    // In case both input and dst have to be reinterpreted as 3D tensors,
+    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+    if (reinterpret_input_as_3d == reinterpret_output_as_3d)
+    {
+        reinterpret_output_as_3d = false;
+    }
+
+    // dst tensor auto initialization if not yet initialized
+    const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info);
+    if (output_stage.type != GEMMLowpOutputStageType::NONE)
+    {
+        auto_init_if_empty(
+            *dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type));
+    }
+    else
+    {
+        auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(DataType::S32));
+    }
+
+    TensorInfo tmp_info(*dst);
+
+    if (reinterpret_output_as_3d)
+    {
+        // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+        // the window needs to be constructed on the 2D collapsed version of the tensor
+        TensorShape tmp_shape(dst->tensor_shape());
+        tmp_shape.collapse(2U, 1U);
+        tmp_info.set_tensor_shape(tmp_shape);
+    }
+
+    // Configure kernel window
+    num_elems_processed_per_iteration_x = gemm_info.rhs_info.n0;
+    num_elems_processed_per_iteration_y = gemm_info.lhs_info.m0;
+
+    win =
+        calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    win_out =
+        calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+    if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+    {
+        if (gemm_info.a_offset != 0)
+        {
+            AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x);
+            window_changed = window_changed || update_window_and_padding(win_out, vector_sum_col_access);
+        }
+        // No access window needed for vector_sum_row
+        ARM_COMPUTE_UNUSED(vector_sum_row);
+
+        if (bias != nullptr)
+        {
+            AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x);
+            window_changed = window_changed || update_window_and_padding(win_out, bias_access);
+        }
+
+        if (output_multipliers != nullptr && output_stage.is_quantized_per_channel)
+        {
+            AccessWindowHorizontal output_multipliers_access(output_multipliers, 0,
+                                                             num_elems_processed_per_iteration_x);
+            AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x);
+            window_changed =
+                window_changed || update_window_and_padding(win_out, output_multipliers_access, output_shifts_access);
+        }
+    }
+
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    Window             collapsed             = win;
+    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
+    collapsed                                = win.collapse(win, dimension_to_collapse);
+
+    Status err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, collapsed);
+}
+} // namespace
+
+ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel()
+{
+    _type = CLKernelType::GEMM;
+}
+
+void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context,
+                                                              const ITensorInfo      *src0,
+                                                              const ITensorInfo      *src1,
+                                                              ITensorInfo            *dst,
+                                                              const GEMMKernelInfo   &gemm_info,
+                                                              ITensorInfo            *vector_sum_col,
+                                                              const ITensorInfo      *vector_sum_row,
+                                                              ITensorInfo            *bias,
+                                                              ITensorInfo            *output_multipliers,
+                                                              ITensorInfo            *output_shifts)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias,
+                                                  output_multipliers, output_shifts));
+
+    auto                          padding_info = get_padding_info({src0, src1, dst, vector_sum_row});
+    const GEMMRHSMatrixInfo       rhs_info     = gemm_info.rhs_info;
+    const GEMMLHSMatrixInfo       lhs_info     = gemm_info.lhs_info;
+    const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
+    const int32_t                 a_offset     = gemm_info.a_offset;
+    const int32_t                 b_offset     = gemm_info.b_offset;
+
+    _reinterpret_input_as_3d  = gemm_info.reinterpret_input_as_3d;
+    _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d != 0);
+    _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
+    _is_quantized_per_channel = output_stage.is_quantized_per_channel;
+
+    // In case both input and dst have to be reinterpreted as 3D tensors,
+    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+    if (_reinterpret_input_as_3d == _reinterpret_output_as_3d)
+    {
+        _reinterpret_input_as_3d  = false;
+        _reinterpret_output_as_3d = false;
+    }
+
+    // Check if we need to slide the matrix B
+    const unsigned int num_dimensions_src0 = src0->num_dimensions();
+    _slide_matrix_b                        = (src1->num_dimensions() >= num_dimensions_src0);
+
+    ElementsProcessed num_elements_processed{};
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias,
+                                                    output_multipliers, output_shifts, num_elements_processed);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
+    // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true,
+    // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
+    // This means that the actual m used by the kernel is given by dst->dimension(1) and not by gemm_info.m
+    const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : dst->dimension(1);
+
+    // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
+    // NOTE: This might have implications on heuristics and performance
+    const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
+
+    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
+    const unsigned int partial_store_m0 = internal_m % internal_m0;
+    const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
+
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
+    build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+                             "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
+    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+                             "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
+    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
+    build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
+    build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+    build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
+    build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
+    build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+    build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
+    build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
+    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
+    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
+    build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(src0->data_type()));
+
+    std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_");
+    kernel_name += rhs_info.transpose ? "t" : "nt";
+
+    if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+    {
+        kernel_name += "_fused_output_stage_fixedpoint";
+        _fuse_output_stage = true;
+        // If a_offset == 0, vector_sum_col can be a nullptr
+        if (a_offset != 0 && vector_sum_col != nullptr)
+        {
+            build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
+            build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
+        }
+        // If b_offset == 0, vector_sum_row can be a nullptr
+        build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
+        build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * src0->dimension(0)));
+        build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+        build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset));
+        // In case of _is_quantized_per_channel, RESULT_MULTIPLIER and RESULT_SHIFT are not utilized, but they are passed as a part of T_QUANTIZE8 macro.
+        if (!_is_quantized_per_channel)
+        {
+            build_opts.add_option("-DRESULT_MULTIPLIER=" +
+                                  support::cpp11::to_string(output_stage.gemmlowp_multipliers[0]));
+            build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0]));
+        }
+        else
+        {
+            build_opts.add_option("-DRESULT_MULTIPLIER=0");
+            build_opts.add_option("-DRESULT_SHIFT=0");
+        }
+        build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION");
+
+        const int min = output_stage.gemmlowp_min_bound;
+        const int max = output_stage.gemmlowp_max_bound;
+
+        PixelValue min_val{};
+        PixelValue max_val{};
+        std::tie(min_val, max_val) = get_min_max(dst->data_type());
+        build_opts.add_option_if(min != min_val.get<int32_t>(), "-DMIN_BOUND=" + support::cpp11::to_string(min));
+        build_opts.add_option_if(max != max_val.get<int32_t>(), "-DMAX_BOUND=" + support::cpp11::to_string(max));
+    }
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : "";
+    _config_id += "_";
+    _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
+    _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
+    _config_id += support::cpp11::to_string(dst->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(gemm_info.k);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.m0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.n0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.k0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.h0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.interleave);
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo    *src0,
+                                                               const ITensorInfo    *src1,
+                                                               const ITensorInfo    *dst,
+                                                               const GEMMKernelInfo &gemm_info,
+                                                               const ITensorInfo    *vector_sum_col,
+                                                               const ITensorInfo    *vector_sum_row,
+                                                               const ITensorInfo    *bias,
+                                                               const ITensorInfo    *output_multipliers,
+                                                               const ITensorInfo    *output_shifts)
+{
+    ElementsProcessed num_elements_processed{};
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias,
+                                                   output_multipliers, output_shifts));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window(src0->clone().get(), src1->clone().get(), dst->clone().get(), gemm_info,
+                                      vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
+                                      vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
+                                      bias != nullptr ? bias->clone().get() : nullptr,
+                                      output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr,
+                                      output_shifts != nullptr ? output_shifts->clone().get() : nullptr,
+                                      num_elements_processed)
+            .first);
+
+    return Status{};
+}
+
+void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack      &tensors,
+                                                           const Window     &window,
+                                                           cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const auto bias =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+    const auto vector_sum_col =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM));
+    const auto vector_sum_row =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM));
+    const auto output_shifts =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS));
+    const auto output_multipliers =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    if (src1->info()->num_dimensions() < 3)
+    {
+        // The stride_z for matrix B must be zero if we do not slice
+        ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
+    }
+
+    Window slice          = window.first_slice_window_3D();
+    Window slice_matrix_b = slice;
+
+    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+    if (_reinterpret_input_as_3d)
+    {
+        // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
+        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3;
+        const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom;
+        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+    }
+
+    if (_reinterpret_output_as_3d)
+    {
+        // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor
+        const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
+        const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
+        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+    }
+
+    // Set window for vector_sum_col
+    Window win_vector_sum_col = slice;
+    win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    // Set window for vector_sum_row
+    Window win_vector_sum_row = slice;
+    win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
+    win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    Window biases_slice = slice;
+    biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+    biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    do
+    {
+        Window slice_b = slice;
+        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        if (!_slide_matrix_b)
+        {
+            slice_b = slice_matrix_b;
+        }
+
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, src0, slice);
+        add_2D_tensor_argument(idx, src1, slice_b);
+        add_2D_tensor_argument(idx, dst, slice);
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
+        if (_reinterpret_input_as_3d)
+        {
+            // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
+            idx++;
+        }
+
+        if (_reinterpret_output_as_3d)
+        {
+            // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor
+            idx++;
+        }
+
+        if (_fuse_output_stage)
+        {
+            add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col);
+            add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row);
+            add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice);
+            add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_multipliers, biases_slice);
+            add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_shifts, biases_slice);
+        }
+        enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
+    } while (window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h
new file mode 100644
index 0000000000..1d4696b089
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to multiply matrices with QASYMM8 data type when only the input matrix RHS (src1) has been reshaped
+ *
+ * @note The input matrix src1 must be reshaped through @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel
+ * @note For fused output stage, only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT type is supported
+ */
+class ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel : public IClKernel
+{
+public:
+    ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel);
+    /** Initialise the kernel's source and destination.
+     *
+     * @param[in]  compile_context    The compile context to be used.
+     * @param[in]  src0               Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  src1               Input tensor containing the RHS reshaped matrix. Data type supported: same as @p src0
+     * @param[out] dst                Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32.
+     * @param[in]  gemm_info          GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info.
+     *                                Only the following values are supported for LHS info:
+     *                                lhs_info.m0: 2,3,4,5,6,7,8
+     *                                lhs_info.k0: 2,3,4,8,16
+     *                                Only the following values are supported for RHS info:
+     *                                rhs_info.n0: 2,3,4,8,16
+     *                                rhs_info.k0: same as lhs_info.k0
+     *                                rhs_info.transpose: true
+     * @param[in]  vector_sum_col     (Optional) Input row-vector of sums of all the entries in each column of matrix B.
+     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32
+     * @param[in]  vector_sum_row     (Optional) Input row-vector of sums of all the entries in each row of matrix A.
+     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32
+     * @param[in]  bias               (Optional) Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+     *                                Biases are 1D tensor with dimensions [OFM]. Data type supported: S32.
+     * @param[in]  output_multipliers (Optional) Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
+     *                                Supported data types: S32.
+     * @param[in]  output_shifts      (Optional) Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
+     *                                Supported data types: S32.
+     */
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo      *src0,
+                   const ITensorInfo      *src1,
+                   ITensorInfo            *dst,
+                   const GEMMKernelInfo   &gemm_info,
+                   ITensorInfo            *vector_sum_col     = nullptr,
+                   const ITensorInfo      *vector_sum_row     = nullptr,
+                   ITensorInfo            *bias               = nullptr,
+                   ITensorInfo            *output_multipliers = nullptr,
+                   ITensorInfo            *output_shifts      = nullptr);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo    *src0,
+                           const ITensorInfo    *src1,
+                           const ITensorInfo    *dst,
+                           const GEMMKernelInfo &gemm_info,
+                           const ITensorInfo    *vector_sum_col     = nullptr,
+                           const ITensorInfo    *vector_sum_row     = nullptr,
+                           const ITensorInfo    *bias               = nullptr,
+                           const ITensorInfo    *output_multipliers = nullptr,
+                           const ITensorInfo    *output_shifts      = nullptr);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    bool _slide_matrix_b{true};
+    bool _reinterpret_input_as_3d{false};
+    bool _reinterpret_output_as_3d{false};
+    bool _use_dummy_work_items{false};
+    bool _is_quantized_per_channel{false};
+    bool _fuse_output_stage{false};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
new file mode 100644
index 0000000000..030c11d069
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
@@ -0,0 +1,530 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+using namespace misc::shape_calculator;
+
+namespace
+{
+using ElementsProcessed = Steps;
+
+Status validate_arguments(const ITensorInfo    *src0,
+                          const ITensorInfo    *src1,
+                          const ITensorInfo    *dst,
+                          const GEMMKernelInfo &gemm_info,
+                          const ITensorInfo    *vector_sum_col,
+                          const ITensorInfo    *vector_sum_row,
+                          const ITensorInfo    *bias,
+                          const ITensorInfo    *output_multipliers,
+                          const ITensorInfo    *output_shifts)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()),
+                                    "The extension cl_arm_matrix_multiply is not supported on the target platform");
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+                                    "The number of dimensions for the LHS matrix must be <= 4");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+                                    "The number of dimensions for the RHS matrix must be <= 3");
+
+    const GEMMRHSMatrixInfo       rhs_info     = gemm_info.rhs_info;
+    const GEMMLHSMatrixInfo       lhs_info     = gemm_info.lhs_info;
+    const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.k0 != 4 || lhs_info.k0 != 4, "Only 4 is supported as value for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(lhs_info.m0 == 1 || lhs_info.m0 == 2 || lhs_info.m0 == 4),
+                                    "Only 1,2,4 are supported for m0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(rhs_info.n0 == 1 || rhs_info.n0 == 4 || rhs_info.n0 == 8),
+                                    "Only 1,4,8 are supported for n0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
+
+    const int m = gemm_info.m;
+    const int n = gemm_info.n;
+    const int k = gemm_info.k;
+
+    TensorShape tensor_shape1{src1->tensor_shape()};
+    tensor_shape1.set(0, n);
+    tensor_shape1.set(1, k);
+
+    const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
+    const TensorInfo tensor_info_reshaped1 =
+        src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+
+    ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast<unsigned int>(k));
+    if (gemm_info.reinterpret_input_as_3d)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast<unsigned int>(m));
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != static_cast<unsigned int>(m));
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
+
+    const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info);
+    if (dst->total_size() != 0)
+    {
+        const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_dst_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
+        if (output_stage.type == GEMMLowpOutputStageType::NONE)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
+        }
+    }
+
+    if (bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != bias->dimension(0));
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) ||
+                                        (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT),
+                                    "Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported");
+
+    // Checks performed if the dst stage needs to be fused
+    if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+    {
+        // If a_offset == 0, vector_sum_col can be a nullptr
+        if (gemm_info.a_offset != 0)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+            ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_dst_shape[0]);
+        }
+
+        // If b_offset == 0, vector_sum_row can be a nullptr
+        if (gemm_info.b_offset != 0)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+
+            // Check if mm result is a 3D reinterpretation
+            const bool reinterpret_as_3d =
+                expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x();
+
+            // Validate input
+            ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) !=
+                                                                 (expected_dst_shape[1] * expected_dst_shape[2]));
+            ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_dst_shape[1]);
+
+            if (expected_dst_shape.num_dimensions() > 1)
+            {
+                const unsigned int dst_batch_idx = reinterpret_as_3d ? 3 : 2;
+
+                TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
+                vector_sum_row_shape.collapse_from(1);
+                TensorShape collapsed_dst_shape(expected_dst_shape);
+                collapsed_dst_shape.collapse_from(dst_batch_idx);
+
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_dst_shape[dst_batch_idx],
+                                                "vector_sum_row must have the same number of batches of dst tensor");
+
+                if (gemm_info.a_offset != 0)
+                {
+                    TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
+                    vector_sum_col_shape.collapse_from(1);
+
+                    ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 &&
+                                                        vector_sum_col_shape[1] != vector_sum_row_shape[1],
+                                                    "vector_sum_col tensor must have the same number of batches of "
+                                                    "vector_sum_row_shape or the number of batches must be set to 1");
+                }
+            }
+        }
+
+        if (dst->total_size() != 0)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type());
+        }
+        ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
+
+        if (output_multipliers != nullptr && output_shifts != nullptr)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
+            ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
+            ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
+            if (output_stage.is_quantized_per_channel)
+            {
+                ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_shifts->dimension(0));
+                ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_multipliers->dimension(0));
+            }
+        }
+    }
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo    *src0,
+                                                        const ITensorInfo    *src1,
+                                                        ITensorInfo          *dst,
+                                                        const GEMMKernelInfo &gemm_info,
+                                                        ITensorInfo          *vector_sum_col,
+                                                        const ITensorInfo    *vector_sum_row,
+                                                        ITensorInfo          *bias,
+                                                        ITensorInfo          *output_multipliers,
+                                                        ITensorInfo          *output_shifts,
+                                                        ElementsProcessed    &num_elements_processed)
+{
+    const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
+
+    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+    bool          reinterpret_output_as_3d            = (gemm_info.depth_output_gemm3d != 0);
+
+    Window win{};
+    bool   window_changed = false;
+
+    constexpr unsigned int mmul_n0 = 4;
+    constexpr unsigned int mmul_m0 = 4;
+    constexpr unsigned int mmul_k0 = 16;
+
+    reinterpret_output_as_3d = false;
+    // dst tensor auto initialization if not yet initialized
+    const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info);
+    if (output_stage.type != GEMMLowpOutputStageType::NONE)
+    {
+        auto_init_if_empty(
+            *dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type));
+    }
+    else
+    {
+        auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(DataType::S32));
+    }
+
+    TensorInfo tmp_info(*dst);
+
+    if (reinterpret_output_as_3d)
+    {
+        // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+        // the window needs to be constructed on the 2D collapsed version of the tensor
+        TensorShape tmp_shape(dst->tensor_shape());
+        tmp_shape.collapse(2U, 1U);
+        tmp_info.set_tensor_shape(tmp_shape);
+    }
+
+    // Configure kernel window
+    num_elems_processed_per_iteration_x = 1;
+    num_elems_processed_per_iteration_y = 1;
+
+    win =
+        calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+    if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+    {
+        if (gemm_info.a_offset != 0)
+        {
+            AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x);
+            window_changed = window_changed || update_window_and_padding(win, vector_sum_col_access);
+        }
+        // No access window needed for vector_sum_row
+        ARM_COMPUTE_UNUSED(vector_sum_row);
+
+        if (bias != nullptr)
+        {
+            AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x);
+            window_changed = window_changed || update_window_and_padding(win, bias_access);
+        }
+
+        if (output_multipliers != nullptr && output_stage.is_quantized_per_channel)
+        {
+            AccessWindowHorizontal output_multipliers_access(output_multipliers, 0,
+                                                             num_elems_processed_per_iteration_x);
+            AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x);
+            window_changed =
+                window_changed || update_window_and_padding(win, output_multipliers_access, output_shifts_access);
+        }
+    }
+
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
+    Window             collapsed             = win.collapse(win, dimension_to_collapse);
+
+    // Reconfigure window size, one arm_matrix_multiply kernel needs 16 threads to finish.
+    Window::Dimension x_dimension = collapsed.x();
+    Window::Dimension y_dimension = collapsed.y();
+
+    // Make M and N multiple of M0 and N0 respectively
+    const unsigned int ceil_to_multiple_n_n0 = ceil_to_multiple(x_dimension.end(), gemm_info.rhs_info.n0);
+    const unsigned int ceil_to_multiple_m_m0 = ceil_to_multiple(y_dimension.end(), gemm_info.lhs_info.m0);
+
+    // Divide M and N by M0 and N0 respectively
+    const unsigned int n_div_n0 = ceil_to_multiple_n_n0 / gemm_info.rhs_info.n0;
+    const unsigned int m_div_m0 = ceil_to_multiple_m_m0 / gemm_info.lhs_info.m0;
+
+    // Make n_div_n0 and m_div_m0 multiple of mmul_n0 and mmul_k0 respectively
+    const unsigned int ceil_to_multiple_n_div_n0_mmul_n0 = ceil_to_multiple(n_div_n0, mmul_n0);
+    const unsigned int ceil_to_multiple_m_div_m0_mmul_m0 = ceil_to_multiple(m_div_m0, mmul_k0);
+
+    // Ensure x_dimension is multiple of MMUL block size (mmul_n0 * mmul_m0)
+    x_dimension.set_end(ceil_to_multiple_n_div_n0_mmul_n0 * mmul_n0);
+    y_dimension.set_end(ceil_to_multiple_m_div_m0_mmul_m0 / mmul_m0);
+
+    collapsed.set(Window::DimX, x_dimension);
+    collapsed.set(Window::DimY, y_dimension);
+
+    Status err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, collapsed);
+}
+} // namespace
+
+ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel()
+{
+    _type = CLKernelType::GEMM;
+}
+
+void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext &compile_context,
+                                                                  const ITensorInfo      *src0,
+                                                                  const ITensorInfo      *src1,
+                                                                  ITensorInfo            *dst,
+                                                                  const GEMMKernelInfo   &gemm_info,
+                                                                  ITensorInfo            *vector_sum_col,
+                                                                  const ITensorInfo      *vector_sum_row,
+                                                                  ITensorInfo            *bias,
+                                                                  ITensorInfo            *output_multipliers,
+                                                                  ITensorInfo            *output_shifts)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias,
+                                                  output_multipliers, output_shifts));
+
+    auto                          padding_info = get_padding_info({src0, src1, dst, vector_sum_row});
+    const GEMMRHSMatrixInfo       rhs_info     = gemm_info.rhs_info;
+    const GEMMLHSMatrixInfo       lhs_info     = gemm_info.lhs_info;
+    const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
+    const int32_t                 a_offset     = gemm_info.a_offset;
+    const int32_t                 b_offset     = gemm_info.b_offset;
+    constexpr int                 mmul_m0      = 4;
+    constexpr int                 mmul_n0      = 4;
+    constexpr int                 mmul_k0      = 16;
+
+    _m = gemm_info.m;
+    _n = gemm_info.n;
+    _k = gemm_info.k;
+
+    ElementsProcessed num_elements_processed{};
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias,
+                                                    output_multipliers, output_shifts, num_elements_processed);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
+    const unsigned int m0_leftover = _m % lhs_info.m0;
+    const unsigned int n0_leftover = _n % rhs_info.n0;
+
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
+    build_opts.add_option("-DVEC_TYPE=" + get_cl_type_from_data_type(src0->data_type()) + "4");
+    build_opts.add_option("-DACC_DATA_TYPE=int");
+    build_opts.add_option("-DOUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+    build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
+    build_opts.add_option("-DM0_LEFTOVER=" + support::cpp11::to_string(m0_leftover));
+    build_opts.add_option("-DN0_LEFTOVER=" + support::cpp11::to_string(n0_leftover));
+    build_opts.add_option("-DMMUL_M0=" + support::cpp11::to_string(mmul_m0));
+    build_opts.add_option("-DMMUL_N0=" + support::cpp11::to_string(mmul_n0));
+    build_opts.add_option("-DMMUL_K0=" + support::cpp11::to_string(mmul_k0));
+    build_opts.add_option("-DACTIVATION_TYPE=" +
+                          lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+    build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+    build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
+
+    std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_mmul");
+
+    if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+    {
+        build_opts.add_option("-DFUSED_OUTPUT_STAGE_FIXED_POINT");
+        _fuse_output_stage = true;
+        // If a_offset == 0, vector_sum_col can be a nullptr
+        if (a_offset != 0 && vector_sum_col != nullptr)
+        {
+            build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
+            build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
+        }
+        // If b_offset == 0, vector_sum_row can be a nullptr
+        build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
+        build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * src0->dimension(0)));
+        build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+        build_opts.add_option_if(gemm_info.broadcast_bias == true, "-DBROADCAST_BIAS");
+        build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset));
+        build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0]));
+        build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0]));
+
+        const int min = output_stage.gemmlowp_min_bound;
+        const int max = output_stage.gemmlowp_max_bound;
+
+        PixelValue min_val{};
+        PixelValue max_val{};
+        std::tie(min_val, max_val) = get_min_max(dst->data_type());
+        build_opts.add_option_if(min != min_val.get<int32_t>(), "-DMIN_BOUND=" + support::cpp11::to_string(min));
+        build_opts.add_option_if(max != max_val.get<int32_t>(), "-DMAX_BOUND=" + support::cpp11::to_string(max));
+    }
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += (bias != nullptr ? "add_bias_" : "");
+    _config_id += (gemm_info.broadcast_bias ? "broadcast_bias_" : "");
+    _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : "");
+    _config_id += lower_string(string_from_data_type(src0->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(_m);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(_n);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(_k);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.m0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.n0);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo    *src0,
+                                                                   const ITensorInfo    *src1,
+                                                                   const ITensorInfo    *dst,
+                                                                   const GEMMKernelInfo &gemm_info,
+                                                                   const ITensorInfo    *vector_sum_col,
+                                                                   const ITensorInfo    *vector_sum_row,
+                                                                   const ITensorInfo    *bias,
+                                                                   const ITensorInfo    *output_multipliers,
+                                                                   const ITensorInfo    *output_shifts)
+{
+    ElementsProcessed num_elements_processed{};
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias,
+                                                   output_multipliers, output_shifts));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window(src0->clone().get(), src1->clone().get(), dst->clone().get(), gemm_info,
+                                      vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
+                                      vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
+                                      bias != nullptr ? bias->clone().get() : nullptr,
+                                      output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr,
+                                      output_shifts != nullptr ? output_shifts->clone().get() : nullptr,
+                                      num_elements_processed)
+            .first);
+
+    return Status{};
+}
+
+void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack      &tensors,
+                                                               const Window     &window,
+                                                               cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const auto src2 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+    const auto vector_sum_col =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM));
+    const auto vector_sum_row =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+
+    if (src1->info()->num_dimensions() < 3)
+    {
+        // The stride_z for matrix B must be zero if we do not slice
+        ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
+    }
+
+    cl::Image2D src1_image2d;
+
+    Window slice = window.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+
+        add_3d_tensor_nhw_argument(idx, src0);
+        add_3d_tensor_nhw_argument(idx, src1);
+
+        // Bias buffer (_add_bias == true)
+        if (src2 != nullptr)
+        {
+            add_3d_tensor_nhw_argument(idx, src2);
+        }
+        // dst buffer
+        add_3d_tensor_nhw_argument(idx, dst);
+
+        // Pass m, n and k at runtime as signed ints, to ensure results of any subtraction they could be operand in, would still be signed.
+        _kernel.setArg<cl_int>(idx++, _m);
+        _kernel.setArg<cl_int>(idx++, _n);
+        _kernel.setArg<cl_int>(idx++, _k);
+
+        if (_fuse_output_stage)
+        {
+            if (vector_sum_col != nullptr)
+            {
+                add_3d_tensor_nhw_argument(idx, vector_sum_col);
+            }
+            if (vector_sum_row != nullptr)
+            {
+                add_3d_tensor_nhw_argument(idx, vector_sum_row);
+            }
+        }
+
+        enqueue(queue, *this, slice, cl::NDRange(32, 2), false);
+    } while (window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h
new file mode 100644
index 0000000000..fc8b73140d
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMUL_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMUL_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to multiply matrices with QASYMM8/QASYMM8_SIGNED data types when only the input matrix RHS (src1) has been reshaped using the MMUL instruction
+ *
+ * @note The input matrix src1 must be reshaped through @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel
+ * @note For fused output stage, only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT type is supported
+ */
+class ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel : public IClKernel
+{
+public:
+    ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel);
+    /** Initialise the kernel's source and destination.
+     *
+     * @param[in]  compile_context    The compile context to be used.
+     * @param[in]  src0               Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  src1               Input tensor containing the RHS reshaped matrix. Data type supported: same as @p src0
+     * @param[out] dst                Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32.
+     * @param[in]  gemm_info          GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info.
+     *                                lhs_info.m0: 1,2,4
+     *                                Only the following values are supported for RHS info:
+     *                                rhs_info.n0: 1,4,8
+     *                                rhs_info.k0: same as lhs_info.k0: 4
+     *                                rhs_info.transpose: true
+     * @param[in]  vector_sum_col     (Optional) Input row-vector of sums of all the entries in each column of matrix B.
+     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32
+     * @param[in]  vector_sum_row     (Optional) Input row-vector of sums of all the entries in each row of matrix A.
+     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32
+     * @param[in]  bias               (Optional) Biases tensor. Can be a nullptr if the addition of biases is not required.
+     *                                Biases are 1D tensor with dimensions [OFM] or same dimensionality as dst if gemm_info.broadcast_bias is false. Data type supported: S32.
+     * @param[in]  output_multipliers (Optional) Output multipliers tensor. Supported data types: S32.
+     * @param[in]  output_shifts      (Optional) Output shifts tensor. Supported data types: S32.
+     */
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo      *src0,
+                   const ITensorInfo      *src1,
+                   ITensorInfo            *dst,
+                   const GEMMKernelInfo   &gemm_info,
+                   ITensorInfo            *vector_sum_col     = nullptr,
+                   const ITensorInfo      *vector_sum_row     = nullptr,
+                   ITensorInfo            *bias               = nullptr,
+                   ITensorInfo            *output_multipliers = nullptr,
+                   ITensorInfo            *output_shifts      = nullptr);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo    *src0,
+                           const ITensorInfo    *src1,
+                           const ITensorInfo    *dst,
+                           const GEMMKernelInfo &gemm_info,
+                           const ITensorInfo    *vector_sum_col     = nullptr,
+                           const ITensorInfo    *vector_sum_row     = nullptr,
+                           const ITensorInfo    *bias               = nullptr,
+                           const ITensorInfo    *output_multipliers = nullptr,
+                           const ITensorInfo    *output_shifts      = nullptr);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    bool       _fuse_output_stage{false};
+    signed int _m{1};
+    signed int _n{1};
+    signed int _k{1};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMULKERNEL_H */
diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp
new file mode 100644
index 0000000000..d93dbde95a
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2017-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *mm_result,
+                          const ITensorInfo *vector_sum_col,
+                          const ITensorInfo *vector_sum_row,
+                          const ITensorInfo *bias,
+                          int32_t            a_offset,
+                          int32_t            b_offset)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
+
+    if (bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0));
+    }
+
+    // If a_offset == 0, vector_sum_col can be a nullptr
+    if (a_offset != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
+    }
+
+    // If b_offset == 0, vector_sum_row can be a nullptr
+    if (b_offset != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+
+        // Check if input is a 3D reinterpretation
+        const bool reinterpret_as_3d =
+            mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+
+        // Validate input
+        ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) !=
+                                                             (mm_result->dimension(1) * mm_result->dimension(2)));
+        ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
+
+        TensorShape output_shape = mm_result->tensor_shape();
+        if (output_shape.num_dimensions() > 1)
+        {
+            const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
+
+            TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
+            vector_sum_row_shape.collapse_from(1);
+            output_shape.collapse_from(output_batch_idx);
+
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
+                                            "mm_result tensor must have the same number of batches of output tensor");
+
+            if (a_offset != 0)
+            {
+                TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
+                vector_sum_col_shape.collapse_from(1);
+
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 &&
+                                                    vector_sum_col_shape[1] != vector_sum_row_shape[1],
+                                                "vector_sum_col tensor must have the same number of batches of "
+                                                "vector_sum_row_shape or the number of batches must be set to 1");
+            }
+        }
+    }
+
+    return Status{};
+}
+} // namespace
+
+ClGemmLowpOffsetContributionKernel::ClGemmLowpOffsetContributionKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClGemmLowpOffsetContributionKernel::configure(const CLCompileContext &compile_context,
+                                                   const ITensorInfo      *mm_result,
+                                                   const ITensorInfo      *vector_sum_col,
+                                                   const ITensorInfo      *vector_sum_row,
+                                                   const ITensorInfo      *bias,
+                                                   int32_t                 k,
+                                                   int32_t                 a_offset,
+                                                   int32_t                 b_offset)
+{
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset));
+
+    auto padding_info = get_padding_info({mm_result, vector_sum_col, vector_sum_row, bias});
+
+    // Check if input is a 3D reinterpretation
+    const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->num_dimensions() > 1 &&
+                                   mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->dimension(0));
+
+    // Set the arguments to pass at compile time
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration));
+
+    // If a_offset == 0, vector_sum_col can be a nullptr
+    if (a_offset != 0)
+    {
+        build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
+        build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
+    }
+    // If b_offset == 0, vector_sum_row can be a nullptr
+    build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
+    build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
+    build_opts.add_option_if(reinterpret_as_3d,
+                             "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1)));
+    build_opts.add_option_if(reinterpret_as_3d,
+                             "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2)));
+    build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+
+    std::string kernel_name("gemmlowp_offset_contribution");
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration));
+    IClKernel::configure_internal(win);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name + "_";
+    _config_id += support::cpp11::to_string(mm_result->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(mm_result->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(mm_result->dimension(2));
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result,
+                                                    const ITensorInfo *vector_sum_col,
+                                                    const ITensorInfo *vector_sum_row,
+                                                    const ITensorInfo *bias,
+                                                    int32_t            a_offset,
+                                                    int32_t            b_offset)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset));
+    return Status{};
+}
+
+void ClGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IClKernel::window(), window);
+
+    const auto vector_sum_col =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM));
+    const auto vector_sum_row =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM));
+    const auto bias =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+    const auto mm_result = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_SRC_DST));
+
+    Window collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    // Set window for vector_sum_col
+    Window win_vector_sum_col = slice;
+    win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    // Set window for vector_sum_row
+    Window win_vector_sum_row = slice;
+    win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
+    win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    Window biases_slice = slice;
+    biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+    biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, mm_result, slice);
+        add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col);
+        add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row);
+        add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice);
+
+        enqueue(queue, *this, slice, lws_hint());
+    } while (collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h
new file mode 100644
index 0000000000..2080a3a091
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel used to add the offset contribution after the matrix multiplication. The computation is performed in-place
+ *
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication),
+ * and adds to it the offset contribution of matrix A and matrix B in-place.
+ *
+ * The final result is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ *                   (vector_sum_col[k] * a_offset) +
+ *                   (vector_sum_row[i] * b_offset) +
+ *                   (a_offset * b_offset * k)
+ *
+ */
+class ClGemmLowpOffsetContributionKernel : public IClKernel
+{
+public:
+    ClGemmLowpOffsetContributionKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpOffsetContributionKernel);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]      compile_context The compile context to be used.
+     * @param[in, out] mm_result       Input tensor containing the result of the matrix multiplication. Data type supported: S32
+     * @param[in]      vector_sum_col  Input row-vector of sums of all the entries in each column of matrix B.
+     *                                 Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+     * @param[in]      vector_sum_row  Input row-vector of sums of all the entries in each row of matrix A.
+     *                                 Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+     * @param[in]      bias            Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+     *                                 Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[in]      k               Number of matrix A columns or Matrix B rows
+     * @param[in]      a_offset        Offset to be added to each element of the matrix A.
+     * @param[in]      b_offset        Offset to be added to each element of the matrix B.
+     */
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo      *mm_result,
+                   const ITensorInfo      *vector_sum_col,
+                   const ITensorInfo      *vector_sum_row,
+                   const ITensorInfo      *bias,
+                   int32_t                 k,
+                   int32_t                 a_offset,
+                   int32_t                 b_offset);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmLowpOffsetContributionKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *mm_result,
+                           const ITensorInfo *vector_sum_col,
+                           const ITensorInfo *vector_sum_row,
+                           const ITensorInfo *bias,
+                           int32_t            a_offset,
+                           int32_t            b_offset);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp
new file mode 100644
index 0000000000..26f479f61a
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2018-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo             *mm_result,
+                          const ITensorInfo             *vector_sum_col,
+                          const ITensorInfo             *vector_sum_row,
+                          const ITensorInfo             *bias,
+                          const ITensorInfo             *dst,
+                          int32_t                        a_offset,
+                          int32_t                        b_offset,
+                          const GEMMLowpOutputStageInfo &output_stage,
+                          const ITensorInfo             *output_multipliers,
+                          const ITensorInfo             *output_shifts)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
+
+    if (bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0));
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
+    if (output_stage.is_quantized_per_channel)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_shifts->dimension(0));
+        ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_multipliers->dimension(0));
+    }
+
+    // If a_offset == 0, vector_sum_col can be a nullptr
+    if (a_offset != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
+    }
+
+    // If b_offset == 0, vector_sum_row can be a nullptr
+    if (b_offset != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+
+        // Check if input is a 3D reinterpretation
+        const bool reinterpret_as_3d =
+            mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+
+        // Validate input
+        ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) !=
+                                                             (mm_result->dimension(1) * mm_result->dimension(2)));
+        ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
+
+        TensorShape output_shape = mm_result->tensor_shape();
+        if (output_shape.num_dimensions() > 1)
+        {
+            const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
+
+            TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
+            vector_sum_row_shape.collapse_from(1);
+            output_shape.collapse_from(output_batch_idx);
+
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
+                                            "mm_result tensor must have the same number of batches of output tensor");
+
+            if (a_offset != 0)
+            {
+                TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
+                vector_sum_col_shape.collapse_from(1);
+
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 &&
+                                                    vector_sum_col_shape[1] != vector_sum_row_shape[1],
+                                                "vector_sum_col tensor must have the same number of batches of "
+                                                "vector_sum_row_shape or the number of batches must be set to 1");
+            }
+        }
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type == GEMMLowpOutputStageType::NONE);
+    // Checks performed when output is configured
+    if ((dst != nullptr) && (dst->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type());
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, dst);
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_stage.gemmlowp_multipliers.size() != output_stage.gemmlowp_shifts.size(),
+                                    "per channel quantization info is incorrect");
+
+    return Status{};
+}
+} // namespace
+
+ClGemmLowpOffsetContributionOutputStageKernel::ClGemmLowpOffsetContributionOutputStageKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileContext        &compile_context,
+                                                              const ITensorInfo             *mm_result,
+                                                              const ITensorInfo             *vector_sum_col,
+                                                              const ITensorInfo             *vector_sum_row,
+                                                              const ITensorInfo             *bias,
+                                                              ITensorInfo                   *dst,
+                                                              int32_t                        k,
+                                                              int32_t                        a_offset,
+                                                              int32_t                        b_offset,
+                                                              const GEMMLowpOutputStageInfo &output_stage,
+                                                              const ITensorInfo             *output_multipliers,
+                                                              const ITensorInfo             *output_shifts)
+{
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, dst, output_multipliers, output_shifts);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset,
+                                                  b_offset, output_stage, output_multipliers, output_shifts));
+
+    auto padding_info =
+        get_padding_info({mm_result, vector_sum_col, vector_sum_row, bias, dst, output_multipliers, output_shifts});
+
+    const int min = output_stage.gemmlowp_min_bound;
+    const int max = output_stage.gemmlowp_max_bound;
+
+    _is_quantized_per_channel = output_stage.is_quantized_per_channel;
+
+    // Check if input is a 3D reinterpretation
+    const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->num_dimensions() > 1 &&
+                                   mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+
+    // Auto initialize the output
+    auto_init_if_empty(*dst, mm_result->clone()->set_data_type(output_stage.output_data_type));
+
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->dimension(0));
+
+    // Set the arguments to pass at compile time
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration));
+
+    // If a_offset == 0, vector_sum_col can be a nullptr
+    if (a_offset != 0)
+    {
+        build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
+        build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
+    }
+    // If b_offset == 0, vector_sum_row can be a nullptr
+    build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
+    build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
+    build_opts.add_option_if(reinterpret_as_3d,
+                             "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1)));
+    build_opts.add_option_if(reinterpret_as_3d,
+                             "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2)));
+    build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+    build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset));
+    build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0]));
+    build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0]));
+    build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION");
+    build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
+
+    PixelValue min_val{};
+    PixelValue max_val{};
+    std::tie(min_val, max_val) = get_min_max(dst->data_type());
+    build_opts.add_option_if((min > min_val.get<int32_t>()), "-DMIN_BOUND=" + support::cpp11::to_string(min));
+    build_opts.add_option_if((max < max_val.get<int32_t>()), "-DMAX_BOUND=" + support::cpp11::to_string(max));
+
+    std::string kernel_name("gemmlowp_offset_contribution");
+    kernel_name += "_" + string_from_gemmlowp_output_stage(output_stage.type);
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration));
+    ICLKernel::configure_internal(win);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name + "_";
+    _config_id += support::cpp11::to_string(mm_result->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(mm_result->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(mm_result->dimension(2));
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo             *mm_result,
+                                                               const ITensorInfo             *vector_sum_col,
+                                                               const ITensorInfo             *vector_sum_row,
+                                                               const ITensorInfo             *bias,
+                                                               const ITensorInfo             *dst,
+                                                               int32_t                        a_offset,
+                                                               int32_t                        b_offset,
+                                                               const GEMMLowpOutputStageInfo &output_stage,
+                                                               const ITensorInfo             *output_multipliers,
+                                                               const ITensorInfo             *output_shifts)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset,
+                                                   b_offset, output_stage, output_multipliers, output_shifts));
+    return Status{};
+}
+
+void ClGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack      &tensors,
+                                                           const Window     &window,
+                                                           cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto mm_result =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    const auto bias =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+    const auto vector_sum_col =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM));
+    const auto vector_sum_row =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM));
+    const auto output_shifts =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS));
+    const auto output_multipliers =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    // Set window for vector_sum_col
+    Window win_vector_sum_col = slice;
+    win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    // Set window for vector_sum_row
+    Window win_vector_sum_row = slice;
+    win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
+    win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    Window biases_slice = slice;
+    biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+    biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, mm_result, slice);
+        add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col);
+        add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row);
+        add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice);
+        add_3D_tensor_argument(idx, dst, slice);
+        add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_multipliers, biases_slice);
+        add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_shifts, biases_slice);
+        enqueue(queue, *this, slice, lws_hint());
+    } while (collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h
new file mode 100644
index 0000000000..97ee9bc97f
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_OUTPUT_STAGE_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_OUTPUT_STAGE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel used to add the offset contribution after the matrix multiplication and perform the output stage.
+ *
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), adds to it the offset contribution
+ * of matrix A and matrix B and performs the output stage defined by the output_stage argument
+ *
+ * @note For quantized computations the output data type for auto-initialization must be passed as part of the @ref GEMMLowpOutputStageInfo.
+ */
+class ClGemmLowpOffsetContributionOutputStageKernel : public IClKernel
+{
+public:
+    ClGemmLowpOffsetContributionOutputStageKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpOffsetContributionOutputStageKernel);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context    The compile context to be used.
+     * @param[in]  mm_result          Input tensor containing the result of the matrix multiplication. Data type supported: S32
+     * @param[in]  vector_sum_col     Input row-vector of sums of all the entries in each column of matrix B.
+     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+     * @param[in]  vector_sum_row     Input row-vector of sums of all the entries in each row of matrix A.
+     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+     * @param[in]  bias               Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+     *                                Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
+     * @param[out] dst                Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED.
+     * @param[in]  k                  Number of matrix A columns or Matrix B rows
+     * @param[in]  a_offset           Offset to be added to each element of the matrix A.
+     * @param[in]  b_offset           Offset to be added to each element of the matrix B.
+     * @param[in]  output_stage       GEMMLowp output stage info
+     * @param[in]  output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
+     *                                Supported data types: S32
+     * @param[in]  output_shifts      Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
+     *                                Supported data types: S32
+     */
+    void configure(const CLCompileContext        &compile_context,
+                   const ITensorInfo             *mm_result,
+                   const ITensorInfo             *vector_sum_col,
+                   const ITensorInfo             *vector_sum_row,
+                   const ITensorInfo             *bias,
+                   ITensorInfo                   *dst,
+                   int32_t                        k,
+                   int32_t                        a_offset,
+                   int32_t                        b_offset,
+                   const GEMMLowpOutputStageInfo &output_stage,
+                   const ITensorInfo             *output_multipliers,
+                   const ITensorInfo             *output_shifts);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmLowpOffsetContributionOutputStageKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo             *mm_result,
+                           const ITensorInfo             *vector_sum_col,
+                           const ITensorInfo             *vector_sum_row,
+                           const ITensorInfo             *bias,
+                           const ITensorInfo             *dst,
+                           int32_t                        a_offset,
+                           int32_t                        b_offset,
+                           const GEMMLowpOutputStageInfo &output_stage,
+                           const ITensorInfo             *output_multipliers,
+                           const ITensorInfo             *output_shifts);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    bool _is_quantized_per_channel{false};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_OUTPUT_STAGE_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
new file mode 100644
index 0000000000..7b7beab12c
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo             *src,
+                          const ITensorInfo             *bias,
+                          const ITensorInfo             *dst,
+                          const GEMMLowpOutputStageInfo *info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32);
+
+    // Check biases if exist
+    if (bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
+    }
+
+    if (dst->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != info->output_data_type, "Mismatching dst data type");
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+    }
+
+    return Status{};
+}
+} // namespace
+
+ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+Status ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo             *src,
+                                                                    const ITensorInfo             *bias,
+                                                                    const ITensorInfo             *dst,
+                                                                    const GEMMLowpOutputStageInfo *info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info));
+
+    return Status{};
+}
+
+void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext        &compile_context,
+                                                                   const ITensorInfo             *src,
+                                                                   const ITensorInfo             *bias,
+                                                                   ITensorInfo                   *dst,
+                                                                   const GEMMLowpOutputStageInfo *info)
+{
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info));
+
+    auto padding_info = get_padding_info({src, bias, dst});
+
+    // dst auto inizialitation if not yet initialized
+    auto_init_if_empty(*dst, src->clone()->set_data_type(info->output_data_type));
+
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0));
+
+    // Set the arguments to pass at compile time
+    auto           min = info->gemmlowp_min_bound;
+    auto           max = info->gemmlowp_max_bound;
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(info->gemmlowp_offset));
+    build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(info->gemmlowp_multiplier));
+    build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(info->gemmlowp_shift));
+    build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
+    build_opts.add_option_if(
+        (min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) &&
+            (min != max),
+        "-DMIN_BOUND=" + support::cpp11::to_string(min));
+    build_opts.add_option_if(
+        (max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) &&
+            (min != max),
+        "-DMAX_BOUND=" + support::cpp11::to_string(max));
+    build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+
+    // Create kernel
+    const std::string kernel_name = (info->output_data_type == DataType::QSYMM16)
+                                        ? "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16"
+                                        : "gemmlowp_output_stage_quantize_down_fixedpoint";
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel window
+    auto win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration));
+    ICLKernel::configure_internal(win);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack      &tensors,
+                                                                const Window     &window,
+                                                                cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    const auto bias =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    // Create src window
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    // Setup bias slice
+    unsigned int idx1 = num_arguments_per_3D_tensor();
+    if (bias != nullptr)
+    {
+        Window biases_slice(slice);
+        biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+        biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+        add_1D_tensor_argument(idx1, bias, biases_slice);
+    }
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src, slice);
+        add_3D_tensor_argument(idx1, dst, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    } while (collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h
new file mode 100644
index 0000000000..71c9f4b752
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FIXEDPOINT_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FIXEDPOINT_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED/QSYMM16
+ *
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final quantized value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Compute fixed point multiplication between each entry of input by gemmlowp_multiplier
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Round to nearest division by a power-of-two using result_shift
+ *  -# Add offset to each result
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting int32 values to the proper quantized range and cast to QASYMM8/QASYMM8_SIGNED/QSYMM16.
+ */
+class ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel : public IClKernel
+{
+public:
+    ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel);
+    /** Initialise the kernel's source and destination.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor. Data type supported: S32
+     * @param[in]  bias            Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                             Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src.
+     * @param[out] dst             Destination tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16.
+     * @param[in]  info            Output stage info. Used to pass the quantized output data type
+     */
+    void configure(const CLCompileContext        &compile_context,
+                   const ITensorInfo             *src,
+                   const ITensorInfo             *bias,
+                   ITensorInfo                   *dst,
+                   const GEMMLowpOutputStageInfo *info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo             *src,
+                           const ITensorInfo             *bias,
+                           const ITensorInfo             *dst,
+                           const GEMMLowpOutputStageInfo *info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FIXEDPOINT_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp
new file mode 100644
index 0000000000..52ebd32d46
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2018-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo             *src,
+                          const ITensorInfo             *bias,
+                          const ITensorInfo             *dst,
+                          const GEMMLowpOutputStageInfo *info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON((info->output_data_type != DataType::QASYMM8) &&
+                                (info->output_data_type != DataType::QASYMM8_SIGNED));
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        info->gemmlowp_max_bound >
+        std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)));
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        info->gemmlowp_min_bound <
+            std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)) ||
+        info->gemmlowp_min_bound > info->gemmlowp_max_bound);
+
+    // Check biases if exist
+    if (bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
+    }
+
+    if (dst->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != info->output_data_type, "Mismatching output data type");
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+    }
+
+    return Status{};
+}
+} // namespace
+
+ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+Status ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo             *src,
+                                                               const ITensorInfo             *bias,
+                                                               const ITensorInfo             *dst,
+                                                               const GEMMLowpOutputStageInfo *info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info));
+
+    return Status{};
+}
+
+void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileContext        &compile_context,
+                                                              const ITensorInfo             *src,
+                                                              const ITensorInfo             *bias,
+                                                              ITensorInfo                   *dst,
+                                                              const GEMMLowpOutputStageInfo *info)
+{
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info));
+
+    auto padding_info = get_padding_info({src, bias, dst});
+
+    // Output auto inizialitation if not yet initialized
+    auto_init_if_empty(*dst, src->clone()->set_data_type(info->output_data_type));
+
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0));
+
+    auto min = info->gemmlowp_min_bound;
+    auto max = info->gemmlowp_max_bound;
+
+    // Set the arguments to pass at compile time
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DREAL_MULTIPLIER=" + float_to_string_with_full_precision(info->gemmlowp_real_multiplier));
+    build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(info->gemmlowp_offset));
+    build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
+    build_opts.add_option_if((min > 0), "-DMIN_BOUND=" + support::cpp11::to_string(min));
+    build_opts.add_option_if((max < 255), "-DMAX_BOUND=" + support::cpp11::to_string(max));
+    build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+
+    const std::string kernel_name = "gemmlowp_output_stage_quantize_down_float";
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration));
+    ICLKernel::configure_internal(win);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack      &tensors,
+                                                           const Window     &window,
+                                                           cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    const auto bias =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    // Create input window
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    // Setup bias slice
+    unsigned int idx1 = num_arguments_per_3D_tensor();
+    if (bias != nullptr)
+    {
+        Window biases_slice(slice);
+        biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+        biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+        add_1D_tensor_argument(idx1, bias, biases_slice);
+    }
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src, slice);
+        add_3D_tensor_argument(idx1, dst, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    } while (collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h
new file mode 100644
index 0000000000..057c66767f
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FLOAT_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FLOAT_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Requantize
+ *  -# Add offset to each result
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting int32 values to
+ *      - to the [0..255] range and cast to QASYMM8.
+ *      - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ */
+class ClGemmLowpQuantizeDownInt32ScaleByFloatKernel : public IClKernel
+{
+public:
+    ClGemmLowpQuantizeDownInt32ScaleByFloatKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpQuantizeDownInt32ScaleByFloatKernel);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor. Data type supported: S32
+     * @param[in]  bias            Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                             Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src.
+     * @param[out] dst             Destination tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  info            Output stage info. Used to pass the quantized output data type
+     */
+    void configure(const CLCompileContext        &compile_context,
+                   const ITensorInfo             *src,
+                   const ITensorInfo             *bias,
+                   ITensorInfo                   *dst,
+                   const GEMMLowpOutputStageInfo *info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo             *src,
+                           const ITensorInfo             *bias,
+                           const ITensorInfo             *dst,
+                           const GEMMLowpOutputStageInfo *info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FLOAT_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp
new file mode 100644
index 0000000000..31434ce61b
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo             *src,
+                          const ITensorInfo             *bias,
+                          const ITensorInfo             *dst,
+                          const GEMMLowpOutputStageInfo *output_stage)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON((output_stage->output_data_type != DataType::QASYMM8) &&
+                                (output_stage->output_data_type != DataType::QASYMM8_SIGNED));
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        output_stage->gemmlowp_max_bound >
+        std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)));
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        output_stage->gemmlowp_min_bound <
+            std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) ||
+        output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound);
+
+    // Check biases if exist
+    if (bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
+    }
+
+    if (dst->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != output_stage->output_data_type,
+                                        "Mismatching output data type");
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+    }
+
+    return Status{};
+}
+} //namespace
+
+ClGemmLowpQuantizeDownInt32ScaleKernel::ClGemmLowpQuantizeDownInt32ScaleKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+Status ClGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo             *src,
+                                                        const ITensorInfo             *bias,
+                                                        const ITensorInfo             *dst,
+                                                        const GEMMLowpOutputStageInfo *output_stage)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, output_stage));
+
+    return Status{};
+}
+
+void ClGemmLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext        &compile_context,
+                                                       const ITensorInfo             *src,
+                                                       const ITensorInfo             *bias,
+                                                       ITensorInfo                   *dst,
+                                                       const GEMMLowpOutputStageInfo *output_stage)
+{
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, output_stage));
+
+    auto padding_info = get_padding_info({src, bias, dst});
+
+    // Output auto inizialitation if not yet initialized
+    auto_init_if_empty(*dst, src->clone()->set_data_type(output_stage->output_data_type));
+
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0));
+
+    // Set the arguments to pass at compile time
+    auto           min = output_stage->gemmlowp_min_bound;
+    auto           max = output_stage->gemmlowp_max_bound;
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage->gemmlowp_offset));
+    build_opts.add_option("-DRESULT_MULT_INT=" + support::cpp11::to_string(output_stage->gemmlowp_multiplier));
+    build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage->gemmlowp_shift));
+    build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(
+                                        output_stage->output_data_type))) &&
+                                 (min != max),
+                             "-DMIN_BOUND=" + support::cpp11::to_string(min));
+    build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(
+                                        output_stage->output_data_type))) &&
+                                 (min != max),
+                             "-DMAX_BOUND=" + support::cpp11::to_string(max));
+    build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
+    build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+
+    const std::string kernel_name = "gemmlowp_output_stage_quantize_down";
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration));
+    ICLKernel::configure_internal(win);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+void ClGemmLowpQuantizeDownInt32ScaleKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    const auto bias =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    unsigned int idx1 = num_arguments_per_3D_tensor();
+    if (bias != nullptr)
+    {
+        Window biases_slice(slice);
+        biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+        biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+        add_1D_tensor_argument(idx1, bias, biases_slice);
+    }
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src, slice);
+        add_3D_tensor_argument(idx1, dst, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    } while (collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h
new file mode 100644
index 0000000000..e6390801f1
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Add offset terms to final result
+ *  -# Multiply each entry of result by result_mult_int
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Shift the int32 accumulator by result_shift
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting int32 values:
+ *  -#  -to the [0..255] range and cast to QASYMM8.
+ *  -#  -to the [-128..127] range and cast to QASYMM8_SIGNED.
+ */
+class ClGemmLowpQuantizeDownInt32ScaleKernel : public ICLKernel
+{
+public:
+    ClGemmLowpQuantizeDownInt32ScaleKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpQuantizeDownInt32ScaleKernel);
+    /** Initialise the kernel's source and destination.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor. Data type supported: S32
+     * @param[in]  bias            Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                             Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src.
+     * @param[out] dst             Destination tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  output_stage    GEMMLowp output stage metadata.
+     */
+    void configure(const CLCompileContext        &compile_context,
+                   const ITensorInfo             *src,
+                   const ITensorInfo             *bias,
+                   ITensorInfo                   *dst,
+                   const GEMMLowpOutputStageInfo *output_stage);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo             *src,
+                           const ITensorInfo             *bias,
+                           const ITensorInfo             *dst,
+                           const GEMMLowpOutputStageInfo *output_stage);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp
new file mode 100644
index 0000000000..ee4a191fed
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2017-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::QSYMM8);
+
+    if (dst->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            dst->dimension(0) != src->dimension(1),
+            "Output vector must have length equal to the number of rows of the input matrix");
+    }
+    return Status{};
+}
+
+Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+
+    if (dst->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            dst->dimension(0) != src->dimension(0),
+            "Output vector must have length equal to the number of columns of the input matrix");
+    }
+    return Status{};
+}
+} // namespace
+
+IClGemmLowpReductionKernel::IClGemmLowpReductionKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext            &compile_context,
+                                                 const ITensorInfo                 *mtx_a,
+                                                 ITensorInfo                       *vector_sum_row,
+                                                 const GEMMLowpReductionKernelInfo &info)
+{
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row));
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*vector_sum_row, TensorShape(mtx_a->dimension(1)), 1, DataType::S32);
+
+    auto padding_info = get_padding_info({mtx_a, vector_sum_row});
+
+    // Set the arguments to pass at compile time
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(mtx_a->dimension(0)));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_a->data_type()));
+    build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_a->data_type()));
+    build_opts.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar));
+
+    const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
+
+    std::string kernel_name = "gemmlowp_matrix_a_reduction" + std::string(is_dot8_supported ? "_dot8" : "");
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel window
+    // This kernel does not need padding
+    Window win = calculate_max_window(*vector_sum_row, Steps());
+    ICLKernel::configure_internal(win);
+
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(mtx_a->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(mtx_a->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(mtx_a->dimension(2));
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmLowpMatrixAReductionKernel::validate(const ITensorInfo                 *mtx_a,
+                                                  const ITensorInfo                 *vector_sum_row,
+                                                  const GEMMLowpReductionKernelInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row));
+
+    return Status{};
+}
+
+void ClGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY);
+    Window slice_in  = collapsed.first_slice_window_2D();
+    Window slice_out = collapsed.first_slice_window_2D();
+
+    // Setup input slice. Its dimensions are increased in the cl kernel.
+    slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+    slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+    slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src, slice_in);
+        add_2D_tensor_argument(idx, dst, slice_out);
+        enqueue(queue, *this, slice_out, lws_hint());
+    } while (collapsed.slide_window_slice_2D(slice_out));
+}
+
+void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext            &compile_context,
+                                                 const ITensorInfo                 *mtx_b,
+                                                 ITensorInfo                       *vector_sum_col,
+                                                 const GEMMLowpReductionKernelInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*vector_sum_col, TensorShape(mtx_b->dimension(0)), 1, DataType::S32);
+
+    auto padding_info = get_padding_info({mtx_b, vector_sum_col});
+
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, mtx_b->dimension(0));
+
+    // Set the arguments to pass at compile time
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(mtx_b->dimension(0) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(mtx_b->dimension(0)));
+    build_opts.add_option("-DROWS_B=" + support::cpp11::to_string(mtx_b->dimension(1)));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_b->data_type()));
+    build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_b->data_type()));
+    build_opts.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar));
+
+    const std::string kernel_name = "gemmlowp_matrix_b_reduction";
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*vector_sum_col, Steps(num_elems_processed_per_iteration));
+    IClKernel::configure_internal(win);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmLowpMatrixBReductionKernel::validate(const ITensorInfo                 *mtx_b,
+                                                  const ITensorInfo                 *vector_sum_col,
+                                                  const GEMMLowpReductionKernelInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));
+
+    return Status{};
+}
+
+void ClGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    Window collapsed = window.collapse_if_possible(IKernel::window(), Window::DimY);
+
+    Window slice_out = collapsed.first_slice_window_2D();
+    Window slice_in  = slice_out;
+
+    slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+    slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src, slice_in);
+        add_2D_tensor_argument(idx, dst, slice_out);
+        enqueue(queue, *this, slice_out, lws_hint());
+    } while (collapsed.slide_window_slice_2D(slice_out));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h
new file mode 100644
index 0000000000..c81543e4c2
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Common interface for all OpenCL reduction kernels */
+class IClGemmLowpReductionKernel : public IClKernel
+{
+public:
+    IClGemmLowpReductionKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClGemmLowpReductionKernel);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
+     * @param[out] output          Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
+     * @param[in]  info            Kernel metadata:
+     *                             - k            Number of matrix columns/rows depending on the type of reduction.
+     *                             - is_reshaped  True if the matrix has been reshaped.
+     *                             - scalar       Scalar value to multiply each reduced column/row by.
+     *                             - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
+     */
+    virtual void configure(const CLCompileContext            &compile_context,
+                           const ITensorInfo                 *input,
+                           ITensorInfo                       *output,
+                           const GEMMLowpReductionKernelInfo &info) = 0;
+};
+
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ */
+class ClGemmLowpMatrixAReductionKernel : public IClGemmLowpReductionKernel
+{
+public:
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  mtx_a           Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
+     * @param[out] vector_sum_row  Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
+     * @param[in]  info            Kernel metadata:
+     *                             - k            Number of matrix columns/rows depending on the type of reduction.
+     *                             - is_reshaped  True if the matrix has been reshaped.
+     *                             - scalar       Scalar value to multiply each reduced column/row by.
+     *                             - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
+     */
+    void configure(const CLCompileContext            &compile_context,
+                   const ITensorInfo                 *mtx_a,
+                   ITensorInfo                       *vector_sum_row,
+                   const GEMMLowpReductionKernelInfo &info) override;
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure()
+     *
+     * @return a status
+     */
+    static Status
+    validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ */
+class ClGemmLowpMatrixBReductionKernel : public IClGemmLowpReductionKernel
+{
+public:
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  mtx_b           Input tensor. Data type supported: Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
+     * @param[out] vector_sum_col  Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
+     * @param[in]  info            Kernel metadata:
+     *                             - k            Number of matrix columns/rows depending on the type of reduction.
+     *                             - is_reshaped  True if the matrix has been reshaped.
+     *                             - scalar       Scalar value to multiply each reduced column/row by.
+     *                             - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
+     */
+    void configure(const CLCompileContext            &compile_context,
+                   const ITensorInfo                 *mtx_b,
+                   ITensorInfo                       *vector_sum_col,
+                   const GEMMLowpReductionKernelInfo &info) override;
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure()
+     *
+     * @return a status
+     */
+    static Status
+    validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
new file mode 100644
index 0000000000..fd23aa9924
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
@@ -0,0 +1,471 @@
+/*
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLUtils.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/helpers/float_ops.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+using ElementsProcessed = Steps;
+
+Status validate_arguments(const ITensorInfo       *src0,
+                          const ITensorInfo       *src1,
+                          const ITensorInfo       *src2,
+                          const ITensorInfo       *dst,
+                          float                    alpha,
+                          float                    beta,
+                          const GEMMLHSMatrixInfo &lhs_info,
+                          const GEMMRHSMatrixInfo &rhs_info,
+                          const GEMMKernelInfo    &gemm_info)
+{
+    ARM_COMPUTE_UNUSED(alpha);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F32, DataType::F16);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+                                    "The number of dimensions for the LHS matrix must be <= 4");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+                                    "The number of dimensions for the RHS matrix must be <= 3");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3),
+                                    "Only 2,3,4,8,16 are supported for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16);
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3),
+                                    "Only 2,3,4,8,16 are supported for n0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        (gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) &&
+            (!gemm_info.broadcast_bias),
+        "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for GEMM native");
+
+    const unsigned int m = gemm_info.m;
+    const unsigned int n = gemm_info.n;
+    const unsigned int k = gemm_info.k;
+
+    ARM_COMPUTE_UNUSED(m);
+    ARM_COMPUTE_UNUSED(n);
+    ARM_COMPUTE_UNUSED(k);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k);
+    ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) != n);
+    ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != k);
+    if (gemm_info.reinterpret_input_as_3d)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m);
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != m);
+    }
+
+    if (src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
+    {
+        const unsigned int src2_dim0 = src2->dimension(0);
+        const unsigned int src2_dim1 = src2->dimension(1);
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1);
+        if (gemm_info.broadcast_bias)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n),
+                                            "Incorrect dimension of bias matrix which is to be broadcasted");
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix");
+        }
+    }
+
+    if (dst->total_size() != 0)
+    {
+        const TensorInfo tensor_info_dst =
+            dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo             *src0,
+                                                        ITensorInfo             *src1,
+                                                        ITensorInfo             *src2,
+                                                        ITensorInfo             *dst,
+                                                        const GEMMLHSMatrixInfo &lhs_info,
+                                                        const GEMMRHSMatrixInfo &rhs_info,
+                                                        const GEMMKernelInfo    &gemm_info,
+                                                        ElementsProcessed       &num_elements_processed)
+{
+    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+    bool          reinterpret_input_as_3d             = gemm_info.reinterpret_input_as_3d;
+    bool          reinterpret_output_as_3d            = gemm_info.depth_output_gemm3d != 0;
+
+    Window win{};
+    Window win_out{};
+    bool   window_changed = false;
+
+    // In case both input and dst have to be reinterpreted as 3D tensors,
+    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+    if (reinterpret_input_as_3d == reinterpret_output_as_3d)
+    {
+        reinterpret_output_as_3d = false;
+    }
+
+    // dst tensor auto initialization if not yet initialized
+    auto_init_if_empty(
+        *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
+
+    TensorInfo tmp_info(*dst);
+
+    if (reinterpret_output_as_3d)
+    {
+        // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+        // the window needs to be constructed on the 2D collapsed version of the tensor
+        TensorShape tmp_shape(dst->tensor_shape());
+        tmp_shape.collapse(2U, 1U);
+        tmp_info.set_tensor_shape(tmp_shape);
+    }
+
+    // Configure kernel window
+    num_elems_processed_per_iteration_x = rhs_info.n0;
+    num_elems_processed_per_iteration_y = lhs_info.m0;
+
+    win =
+        calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    win_out =
+        calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+    AccessWindowStatic src0_access(src0, 0, 0, src0->dimension(0), src0->dimension(1));
+    AccessWindowStatic src1_access(
+        src1, 0, 0, ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), src1->dimension(1));
+    AccessWindowStatic dst_access(dst, 0, 0, dst->dimension(0), dst->dimension(1));
+
+    if (src2 != nullptr)
+    {
+        const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
+
+        AccessWindowStatic src2_access(src2, 0, 0, ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x),
+                                       src2->dimension(1));
+
+        window_changed = update_window_and_padding(win, src0_access, src1_access,
+                                                   src2_access) || // window used by the execute_window_loop
+                         update_window_and_padding(
+                             win_out, dst_access); // window used to update the padding requirements of dst tensor
+    }
+    else
+    {
+        window_changed =
+            update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop
+            update_window_and_padding(win_out,
+                                      dst_access); // window used to update the padding requirements of dst tensor
+    }
+
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    Window             collapsed             = win;
+    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
+    collapsed                                = win.collapse(win, dimension_to_collapse);
+
+    Status err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, collapsed);
+}
+} // namespace
+
+ClGemmMatrixMultiplyNativeKernel::ClGemmMatrixMultiplyNativeKernel()
+{
+    _type = CLKernelType::GEMM;
+}
+
+void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext  &compile_context,
+                                                 ITensorInfo             *src0,
+                                                 ITensorInfo             *src1,
+                                                 ITensorInfo             *src2,
+                                                 ITensorInfo             *dst,
+                                                 float                    alpha,
+                                                 float                    beta,
+                                                 const GEMMLHSMatrixInfo &lhs_info,
+                                                 const GEMMRHSMatrixInfo &rhs_info,
+                                                 const GEMMKernelInfo    &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+
+    // dst tensor auto initialization if not yet initialized
+    auto_init_if_empty(
+        *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
+
+    auto padding_info         = get_padding_info({src0, dst});
+    _reinterpret_input_as_3d  = gemm_info.reinterpret_input_as_3d;
+    _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
+    _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
+    _add_bias                 = src2 != nullptr;
+
+    // In case both input and dst have to be reinterpreted as 3D tensors,
+    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+    if (_reinterpret_input_as_3d == _reinterpret_output_as_3d)
+    {
+        _reinterpret_input_as_3d  = false;
+        _reinterpret_output_as_3d = false;
+    }
+
+    // Check if we need to slide the matrix B
+    const unsigned int num_dimensions_src0 = src0->num_dimensions();
+    _slide_matrix_b                        = (src1->num_dimensions() >= num_dimensions_src0);
+
+    ElementsProcessed num_elements_processed{};
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(src0, src1, src2 != nullptr ? src2 : nullptr, dst, lhs_info,
+                                                    rhs_info, gemm_info, num_elements_processed);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    IClKernel::configure_internal(win_config.second);
+
+    // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true,
+    // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
+    // This means that the actual m used by the kernel is given by dst->dimension(1) and not by gemm_info.m
+    const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : dst->dimension(1);
+
+    const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(1) : src0->dimension(1);
+    const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(2) : src0->dimension(2);
+
+    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
+    const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
+    const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
+
+    // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
+    // NOTE: This might have implications on heuristics and performance
+    const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
+    _m                             = internal_m;
+    _n                             = gemm_info.n;
+    _k                             = gemm_info.k;
+
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
+    build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)),
+                             "-DALPHA=" + float_to_string_with_full_precision(alpha));
+    build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
+    build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
+    build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
+    build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
+    build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+                             "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
+    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+                             "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
+    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
+    build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+    build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
+    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
+    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
+    build_opts.add_option_if(gemm_info.activation_info.enabled(),
+                             "-DACTIVATION_TYPE=" +
+                                 lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+    build_opts.add_option_if(gemm_info.activation_info.enabled(),
+                             "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+    build_opts.add_option_if(gemm_info.activation_info.enabled(),
+                             "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
+
+    std::string kernel_name("gemm_mm_native");
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += (_add_bias ? "add_bias_" : "");
+    _config_id += (gemm_info.broadcast_bias ? "broadcast_bias_" : "");
+    _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
+    _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
+    _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : "");
+    _config_id += lower_string(string_from_data_type(src0->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(gemm_info.k);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.m0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.n0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.k0);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmMatrixMultiplyNativeKernel::validate(const ITensorInfo       *src0,
+                                                  const ITensorInfo       *src1,
+                                                  const ITensorInfo       *src2,
+                                                  const ITensorInfo       *dst,
+                                                  float                    alpha,
+                                                  float                    beta,
+                                                  const GEMMLHSMatrixInfo &lhs_info,
+                                                  const GEMMRHSMatrixInfo &rhs_info,
+                                                  const GEMMKernelInfo    &gemm_info)
+{
+    ElementsProcessed num_elements_processed{};
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(),
+                                                              src2 != nullptr ? src2->clone().get() : nullptr,
+                                                              dst->clone().get(), lhs_info, rhs_info, gemm_info,
+                                                              num_elements_processed)
+                                    .first);
+
+    return Status{};
+}
+
+void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const auto src2 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+    ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr);
+
+    if (src1->info()->num_dimensions() < 3)
+    {
+        // The stride_z for matrix B must be zero if we do not slice
+        ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
+    }
+
+    Window slice          = window.first_slice_window_3D();
+    Window slice_matrix_b = slice;
+
+    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+    if (_reinterpret_input_as_3d)
+    {
+        // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
+        unsigned int idx0;
+        if (_add_bias)
+        {
+            idx0 = 4 * num_arguments_per_2D_tensor() + 7;
+        }
+        else
+        {
+            idx0 = 3 * num_arguments_per_2D_tensor() + 6;
+        }
+        const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom;
+        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+    }
+
+    if (_reinterpret_output_as_3d)
+    {
+        // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor
+        unsigned int idx0;
+        if (_add_bias)
+        {
+            idx0 = 4 * num_arguments_per_2D_tensor() + 7 + (_reinterpret_input_as_3d ? 1 : 0);
+        }
+        else
+        {
+            idx0 = 3 * num_arguments_per_2D_tensor() + 6 + (_reinterpret_input_as_3d ? 1 : 0);
+        }
+        const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
+        _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
+    }
+
+    do
+    {
+        Window slice_b = slice;
+        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        if (!_slide_matrix_b)
+        {
+            slice_b = slice_matrix_b;
+        }
+
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, src0, slice);
+        add_2D_tensor_argument(idx, src1, slice_b);
+        if (_add_bias)
+        {
+            add_2D_tensor_argument(idx, src2, slice);
+        }
+        add_2D_tensor_argument(idx, dst, slice);
+
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
+        if (_add_bias)
+        {
+            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[2]));
+        }
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
+
+        // Pass m, n and k at runtime
+        _kernel.setArg<cl_int>(idx++, _m);
+        _kernel.setArg<cl_int>(idx++, _n);
+        _kernel.setArg<cl_int>(idx++, _k);
+
+        enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
+    } while (window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
new file mode 100644
index 0000000000..da6c9a5bb7
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H
+#define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to multiply matrices when neither of the input matrices have been reshaped */
+class ClGemmMatrixMultiplyNativeKernel : public IClKernel
+{
+public:
+    ClGemmMatrixMultiplyNativeKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyNativeKernel);
+    /** Initialise the kernel's input and dst.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src0            Input tensor for the LHS matrix. Data type supported: F32/F16. The number of dimensions for the LHS matrix must be less or equal than 4.
+     * @param[in]  src1            Input tensor for the RHS matrix. Data type supported: same as @p src0. The number of dimensions for the RHS matrix must be less or equal than 3.
+     * @param[in]  src2            Input tensor containing the bias matrix. Data type supported: same as @p src0.
+     * @param[out] dst             dst tensor info. Data type supported: same as @p src0
+     * @param[in]  alpha           Weight of the matrix product
+     * @param[in]  beta            Weight of the matrix bias
+     * @param[in]  lhs_info        LHS matrix information used to retrieve the number of rows and accumulations to be processed by each thread. Only the following values are supported:
+     *                             lhs_info.m0: 1,2,3,4,5,6,7,8
+     *                             lhs_info.k0: 2,3,4,8,16
+     * @param[in]  rhs_info        RHS matrix information used to retrieve the number of columns and accumulations to be processed by each thread. Only the following values are supported:
+     *                             rhs_info.n0: 2,3,4,8,16
+     *                             rhs_info.k0: same of lhs_info.k0
+     * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
+     */
+    void configure(const ClCompileContext  &compile_context,
+                   ITensorInfo             *src0,
+                   ITensorInfo             *src1,
+                   ITensorInfo             *src2,
+                   ITensorInfo             *dst,
+                   float                    alpha,
+                   float                    beta,
+                   const GEMMLHSMatrixInfo &lhs_info,
+                   const GEMMRHSMatrixInfo &rhs_info,
+                   const GEMMKernelInfo    &gemm_info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmMatrixMultiplyNativeKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo       *src0,
+                           const ITensorInfo       *src1,
+                           const ITensorInfo       *src2,
+                           const ITensorInfo       *dst,
+                           float                    alpha,
+                           float                    beta,
+                           const GEMMLHSMatrixInfo &lhs_info,
+                           const GEMMRHSMatrixInfo &rhs_info,
+                           const GEMMKernelInfo    &gemm_info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    bool       _slide_matrix_b{true};
+    bool       _reinterpret_input_as_3d{false};
+    bool       _reinterpret_output_as_3d{false};
+    bool       _use_dummy_work_items{false};
+    bool       _add_bias{false};
+    signed int _m{1};
+    signed int _n{1};
+    signed int _k{1};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
new file mode 100644
index 0000000000..4fe6bddb36
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
@@ -0,0 +1,447 @@
+/*
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLUtils.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/helpers/float_ops.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+using ElementsProcessed = Steps;
+
+Status validate_arguments(const ITensorInfo       *src0,
+                          const ITensorInfo       *src1,
+                          const ITensorInfo       *src2,
+                          const ITensorInfo       *dst,
+                          float                    alpha,
+                          float                    beta,
+                          const GEMMLHSMatrixInfo &lhs_info,
+                          const GEMMRHSMatrixInfo &rhs_info,
+                          const GEMMKernelInfo    &gemm_info)
+{
+    ARM_COMPUTE_UNUSED(alpha);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+                                    "The number of dimensions for the LHS matrix must be <= 4");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+                                    "The number of dimensions for the RHS matrix must be <= 3");
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose == rhs_info.transpose);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3),
+                                    "Only 2,3,4,8,16 are supported for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_info.transpose) && ((lhs_info.m0 & (lhs_info.m0 - 1)) && lhs_info.m0 != 3),
+                                    "Only 2,3,4,8,16 are supported for m0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.transpose) && ((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3),
+                                    "Only 2,3,4,8,16 are supported for n0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        (gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) &&
+            (!gemm_info.broadcast_bias),
+        "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (src0->data_type() == DataType::F32),
+                                    "Mixed precision only supported for F16 data type");
+    ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info));
+
+    const unsigned int m = gemm_info.m;
+    const unsigned int n = gemm_info.n;
+    const unsigned int k = gemm_info.k;
+
+    TensorShape tensor_shape0{src0->tensor_shape()};
+    tensor_shape0.set(0, k);
+    tensor_shape0.set(1, m);
+
+    TensorShape tensor_shape1{src1->tensor_shape()};
+    tensor_shape1.set(0, n);
+    tensor_shape1.set(1, k);
+
+    if (src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
+    {
+        const unsigned int src2_dim0 = src2->dimension(0);
+        const unsigned int src2_dim1 = src2->dimension(1);
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1);
+        if (gemm_info.broadcast_bias)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n),
+                                            "Incorrect dimension of bias matrix which is to be broadcasted");
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix");
+        }
+    }
+
+    const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0);
+    const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
+
+    const TensorInfo tensor_info_reshaped0 =
+        src0->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(tensor_info0, lhs_info));
+    const TensorInfo tensor_info_reshaped1 =
+        src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
+
+    if (dst->total_size() != 0)
+    {
+        const TensorInfo tensor_info_dst =
+            dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo             *src0,
+                                                        ITensorInfo             *src1,
+                                                        ITensorInfo             *src2,
+                                                        ITensorInfo             *dst,
+                                                        const GEMMLHSMatrixInfo &lhs_info,
+                                                        const GEMMRHSMatrixInfo &rhs_info,
+                                                        const GEMMKernelInfo    &gemm_info,
+                                                        ElementsProcessed       &num_elements_processed)
+{
+    ARM_COMPUTE_UNUSED(src0, src1, src2);
+    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+    bool          reinterpret_output_as_3d            = gemm_info.depth_output_gemm3d != 0;
+
+    TensorInfo tmp_info(*dst);
+
+    if (reinterpret_output_as_3d)
+    {
+        // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+        // the window needs to be constructed on the 2D collapsed version of the tensor
+        TensorShape tmp_shape(dst->tensor_shape());
+        tmp_shape.collapse(2U, 1U);
+        tmp_info.set_tensor_shape(tmp_shape);
+    }
+
+    // Configure kernel window
+    num_elems_processed_per_iteration_x = rhs_info.n0;
+    num_elems_processed_per_iteration_y = lhs_info.m0;
+
+    Window win =
+        calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    Window             collapsed             = win;
+    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
+    collapsed                                = win.collapse(win, dimension_to_collapse);
+
+    return std::make_pair(Status{}, collapsed);
+}
+} // namespace
+
+ClGemmMatrixMultiplyReshapedKernel::ClGemmMatrixMultiplyReshapedKernel()
+{
+    _type = CLKernelType::GEMM;
+}
+
+void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext  &compile_context,
+                                                   const ITensorInfo       *src0,
+                                                   const ITensorInfo       *src1,
+                                                   const ITensorInfo       *src2,
+                                                   ITensorInfo             *dst,
+                                                   float                    alpha,
+                                                   float                    beta,
+                                                   const GEMMLHSMatrixInfo &lhs_info,
+                                                   const GEMMRHSMatrixInfo &rhs_info,
+                                                   const GEMMKernelInfo    &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+
+    // dst tensor auto initialization if not yet initialized
+    auto_init_if_empty(
+        *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
+
+    auto padding_info         = get_padding_info({src0, src1, src2, dst});
+    _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
+    _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
+    _add_bias                 = src2 != nullptr;
+    _export_to_cl_image       = rhs_info.export_to_cl_image;
+
+    // Check if we need to slide the matrix B
+    const unsigned int num_dimensions_src0 = src0->num_dimensions();
+    _slide_matrix_b                        = (src1->num_dimensions() >= num_dimensions_src0);
+
+    ElementsProcessed num_elements_processed{};
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(
+        src0->clone().get(), src1->clone().get(), (src2 != nullptr) ? src2->clone().get() : nullptr, dst->clone().get(),
+        lhs_info, rhs_info, gemm_info, num_elements_processed);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
+    const bool     enable_mixed_precision = gemm_info.fp_mixed_precision;
+    const DataType data_type              = src0->data_type();
+
+    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
+    const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : dst->dimension(1);
+
+    const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
+    const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
+    _m                                  = gemm_info.m;
+    _n                                  = gemm_info.n;
+    _k                                  = gemm_info.k;
+
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)),
+                             "-DALPHA=" + float_to_string_with_full_precision(alpha));
+    build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
+    build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
+    build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+    build_opts.add_option_if(_reinterpret_output_as_3d,
+                             "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
+    build_opts.add_option_if(_reinterpret_output_as_3d,
+                             "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
+    build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
+    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
+    build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
+    build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
+    build_opts.add_option_if(lhs_info.transpose, "-DLHS_TRANSPOSE");
+    build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+    build_opts.add_option_if(enable_mixed_precision, "-DMIXED_PRECISION");
+    build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT");
+    build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1)));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision
+                                                            ? get_cl_type_from_data_type(DataType::F32)
+                                                            : get_cl_type_from_data_type(data_type)));
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+    build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
+    build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
+    build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
+    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
+    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
+    build_opts.add_option_if(gemm_info.activation_info.enabled(),
+                             "-DACTIVATION_TYPE=" +
+                                 lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+    build_opts.add_option_if(gemm_info.activation_info.enabled(),
+                             "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+    build_opts.add_option_if(gemm_info.activation_info.enabled(),
+                             "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
+
+    std::string kernel_name("gemm_mm_reshaped_");
+    kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_";
+    kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt";
+    kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += (_add_bias ? "add_bias_" : "");
+    _config_id += (gemm_info.broadcast_bias ? "broadcast_bias_" : "");
+    _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
+    _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : "");
+    _config_id += lower_string(string_from_data_type(src0->data_type()));
+    _config_id += "_";
+    _config_id += (enable_mixed_precision ? "mixed_precision_" : "");
+    _config_id += support::cpp11::to_string(dst->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(gemm_info.k);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.m0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.n0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.k0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.v0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.h0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.interleave);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.interleave);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmMatrixMultiplyReshapedKernel::validate(const ITensorInfo       *src0,
+                                                    const ITensorInfo       *src1,
+                                                    const ITensorInfo       *src2,
+                                                    const ITensorInfo       *dst,
+                                                    float                    alpha,
+                                                    float                    beta,
+                                                    const GEMMLHSMatrixInfo &lhs_info,
+                                                    const GEMMRHSMatrixInfo &rhs_info,
+                                                    const GEMMKernelInfo    &gemm_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
+    return Status{};
+}
+
+void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const auto src2 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+    ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr);
+
+    if (src1->info()->num_dimensions() < 3)
+    {
+        // The stride_z for matrix B must be zero if we do not slice
+        ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
+    }
+
+    Window slice          = window.first_slice_window_3D();
+    Window slice_matrix_b = slice;
+
+    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+    const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
+
+    cl::Image2D src1_image2d;
+
+    if (_export_to_cl_image)
+    {
+        const TensorShape shape2d(src1->info()->dimension(0) / 4,
+                                  src1->info()->dimension(1) * src1->info()->dimension(2));
+        const size_t      image_row_pitch = src1->info()->strides_in_bytes()[1];
+
+        src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d,
+                                                  src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+    }
+
+    do
+    {
+        Window slice_b = slice;
+        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        if (!_slide_matrix_b)
+        {
+            slice_b = slice_matrix_b;
+        }
+
+        unsigned int idx = 0;
+
+        // LHS buffer
+        add_2D_tensor_argument(idx, src0, slice);
+
+        // RHS buffer or RHS OpenCL image (_export_to_cl_image == true)
+        if (_export_to_cl_image)
+        {
+            _kernel.setArg(idx++, src1_image2d);
+        }
+        else
+        {
+            add_2D_tensor_argument(idx, src1, slice_b);
+        }
+
+        // Bias buffer (_add_bias == true)
+        add_2D_tensor_argument_if(_add_bias, idx, src2, slice);
+
+        // dst buffer
+        add_2D_tensor_argument(idx, dst, slice);
+
+        // LHS stride_z
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
+
+        // RHS stride_z (not used if _export_to_cl_image == true)
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
+
+        // Bias stride_z (if _add_bias == true)
+        if (_add_bias)
+        {
+            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[2]));
+        }
+
+        // dst stride_z
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
+
+        // Cross-plan padding (if _reinterpret_output_as_3d = true)
+        if (_reinterpret_output_as_3d)
+        {
+            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad));
+        }
+
+        // Pass m, n and k at runtime
+        _kernel.setArg<cl_int>(idx++, _m);
+        _kernel.setArg<cl_int>(idx++, _n);
+
+        // K dimension (not used if _export_to_cl_image == true)
+        _kernel.setArg<cl_int>(idx++, _k);
+
+        // Dispatch kernel
+        enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
+    } while (window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
new file mode 100644
index 0000000000..30928c4e1d
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
+#define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to multiply matrices when both the input matrices LHS (src0) and RHS (src1) have been reshaped
+ *
+ * @note The input matrices @p src0 and @p src1 must be reshaped through:
+ *  - @ref ClGemmReshapeLhsMatrixKernel
+ *  - @ref ClGemmReshapeRhsMatrixKernel
+ */
+class ClGemmMatrixMultiplyReshapedKernel : public IClKernel
+{
+public:
+    ClGemmMatrixMultiplyReshapedKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyReshapedKernel);
+    /** Initialise the kernel's input and output.
+     *
+     * @note The F16 computation also supports mixed precision through the gemm_info.fp_mixed_precision flag.
+     *       Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the
+     *       multiplications. i.e. float c = (half)a * (half)b
+     *
+     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
+     *       Reading from the OpenCL image object can increase the performance. However, since the OpenCL image object is created importing the OpenCL buffer,
+     *       the following conditions are required:
+     *       -# rhs_info.n0 can only be 4, 8 and 16
+     *       -# rhs_info.k0 can only be 4, 8 and 16
+     *       -# Data type can only be F32
+     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+     *       -# The stride Y for the src1 should satisfy the OpenCL pitch alignment requirement
+     *       -# src1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+     *       -# src1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src0            Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32  (only F32 if rhs_info.export_to_cl_image = true). The number of dimensions for the LHS matrix must be less or equal than 4
+     * @param[in]  src1            Input tensor containing the RHS reshaped matrix. Data type supported: same as @p src0. The number of dimensions for the RHS matrix must be less or equal than 3
+     * @param[in]  src2            Input tensor containing the bias matrix. Data type supported: same as @p src0.
+     * @param[out] dst             dst tensor to store the result of matrix multiplication. Data type supported: same as @p src0
+     * @param[in]  alpha           Weight of the matrix product
+     * @param[in]  beta            Weight of the matrix bias
+     * @param[in]  lhs_info        LHS matrix information used for reshaping the src0 tensor.  Only the following values are supported:
+     *                             lhs_info.m0: 2,3,4,5,6,7,8
+     *                             lhs_info.k0: 2,3,4,8,16
+     *                             lhs_info.transpose: false
+     * @param[in]  rhs_info        RHS matrix information used for reshaping the src1 tensor.  Only the following values are supported:
+     *                             rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
+     *                             rhs_info.k0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true)
+     *                             rhs_info.transpose: true
+     * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
+     *
+     * @note lhs_info.k0 must be equal to rhs_info.k0
+     */
+    void configure(const ClCompileContext  &compile_context,
+                   const ITensorInfo       *src0,
+                   const ITensorInfo       *src1,
+                   const ITensorInfo       *src2,
+                   ITensorInfo             *dst,
+                   float                    alpha,
+                   float                    beta,
+                   const GEMMLHSMatrixInfo &lhs_info,
+                   const GEMMRHSMatrixInfo &rhs_info,
+                   const GEMMKernelInfo    &gemm_info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmMatrixMultiplyReshapedKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo       *src0,
+                           const ITensorInfo       *src1,
+                           const ITensorInfo       *src2,
+                           const ITensorInfo       *dst,
+                           float                    alpha,
+                           float                    beta,
+                           const GEMMLHSMatrixInfo &lhs_info,
+                           const GEMMRHSMatrixInfo &rhs_info,
+                           const GEMMKernelInfo    &gemm_info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    bool       _slide_matrix_b{true};
+    bool       _reinterpret_output_as_3d{false};
+    bool       _use_dummy_work_items{false};
+    bool       _add_bias{false};
+    bool       _export_to_cl_image{false};
+    signed int _m{1};
+    signed int _n{1};
+    signed int _k{1};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
new file mode 100644
index 0000000000..1b19f1ec5b
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
@@ -0,0 +1,473 @@
+/*
+ * Copyright (c) 2019-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLUtils.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/helpers/float_ops.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+using ElementsProcessed = Steps;
+
+Status validate_arguments(const ITensorInfo       *src0,
+                          const ITensorInfo       *src1,
+                          const ITensorInfo       *src2,
+                          const ITensorInfo       *dst,
+                          float                    alpha,
+                          float                    beta,
+                          const GEMMLHSMatrixInfo &lhs_info,
+                          const GEMMRHSMatrixInfo &rhs_info,
+                          const GEMMKernelInfo    &gemm_info)
+{
+    ARM_COMPUTE_UNUSED(alpha);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+                                    "The number of dimensions for the LHS matrix must be <= 4");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+                                    "The number of dimensions for the RHS matrix must be <= 3");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_info.m0 < 1 || lhs_info.m0 > 8, "Only 1,2,3,4,5,6,7,8 are supported for m0");
+    ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16 || rhs_info.k0 < 2);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3),
+                                    "Only 2,3,4,8,16 are supported for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16 || rhs_info.n0 < 2);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3),
+                                    "Only 2,3,4,8,16 are supported for n0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        (gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) &&
+            (!gemm_info.broadcast_bias),
+        "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
+    ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info));
+
+    const unsigned int m = gemm_info.m;
+    const unsigned int n = gemm_info.n;
+    const unsigned int k = gemm_info.k;
+
+    TensorShape tensor_shape1{src1->tensor_shape()};
+    tensor_shape1.set(0, n);
+    tensor_shape1.set(1, k);
+
+    if (src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
+    {
+        const unsigned int src2_dim0 = src2->dimension(0);
+        const unsigned int src2_dim1 = src2->dimension(1);
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src0);
+        if (gemm_info.broadcast_bias)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n),
+                                            "Incorrect dimension of bias matrix which is to be broadcasted");
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix");
+        }
+    }
+
+    const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
+
+    const TensorInfo tensor_info_reshaped1 =
+        src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+
+    ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k);
+    if (gemm_info.reinterpret_input_as_3d)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m);
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != m);
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
+
+    if (dst->total_size() != 0)
+    {
+        const TensorInfo tensor_info_dst =
+            dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
+    }
+
+    return Status{};
+}
+
+Window validate_and_configure_window(ITensorInfo             *src0,
+                                     ITensorInfo             *src1,
+                                     ITensorInfo             *src2,
+                                     ITensorInfo             *dst,
+                                     const GEMMLHSMatrixInfo &lhs_info,
+                                     const GEMMRHSMatrixInfo &rhs_info,
+                                     const GEMMKernelInfo    &gemm_info,
+                                     ElementsProcessed       &num_elements_processed)
+{
+    ARM_COMPUTE_UNUSED(src0, src1, src2);
+    unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
+    unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
+    bool          reinterpret_input_as_3d             = gemm_info.reinterpret_input_as_3d;
+    bool          reinterpret_output_as_3d            = gemm_info.depth_output_gemm3d != 0;
+
+    // In case both input and dst have to be reinterpreted as 3D tensors,
+    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+    // This approach should only be used when the input/dst tensors have pad on the y direction
+    if ((reinterpret_input_as_3d == reinterpret_output_as_3d) && gemm_info.has_pad_y)
+    {
+        reinterpret_output_as_3d = false;
+    }
+
+    TensorInfo tmp_info(*dst);
+
+    if (reinterpret_output_as_3d)
+    {
+        // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+        // the window needs to be constructed on the 2D collapsed version of the tensor
+        TensorShape tmp_shape(dst->tensor_shape());
+        tmp_shape.collapse(2U, 1U);
+        tmp_info.set_tensor_shape(tmp_shape);
+    }
+
+    // Configure kernel window
+    num_elems_processed_per_iteration_x = rhs_info.n0;
+    num_elems_processed_per_iteration_y = lhs_info.m0;
+
+    Window win =
+        calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
+    Window             collapsed             = win.collapse(win, dimension_to_collapse);
+
+    return collapsed;
+}
+} // namespace
+
+ClGemmMatrixMultiplyReshapedOnlyRhsKernel::ClGemmMatrixMultiplyReshapedOnlyRhsKernel()
+{
+    _type = CLKernelType::GEMM;
+}
+
+void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext  &compile_context,
+                                                          const ITensorInfo       *src0,
+                                                          const ITensorInfo       *src1,
+                                                          const ITensorInfo       *src2,
+                                                          ITensorInfo             *dst,
+                                                          float                    alpha,
+                                                          float                    beta,
+                                                          const GEMMLHSMatrixInfo &lhs_info,
+                                                          const GEMMRHSMatrixInfo &rhs_info,
+                                                          const GEMMKernelInfo    &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+
+    // dst tensor auto initialization if not yet initialized
+    auto_init_if_empty(
+        *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
+
+    _reinterpret_input_as_3d  = gemm_info.reinterpret_input_as_3d;
+    _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
+    _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
+    _add_bias                 = src2 != nullptr;
+    _export_to_cl_image       = rhs_info.export_to_cl_image;
+    _has_pad_y                = gemm_info.has_pad_y;
+
+    auto padding_info = get_padding_info({src0, src1, src2, dst});
+
+    // In case both input and dst have to be reinterpreted as 3D tensors,
+    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+    if ((_reinterpret_input_as_3d == _reinterpret_output_as_3d) && _has_pad_y)
+    {
+        _reinterpret_input_as_3d  = false;
+        _reinterpret_output_as_3d = false;
+    }
+
+    // Check if we need to slide the matrix B
+    const unsigned int num_dimensions_src0 = src0->num_dimensions();
+    _slide_matrix_b                        = (src1->num_dimensions() >= num_dimensions_src0);
+
+    ElementsProcessed num_elements_processed{};
+
+    // Configure kernel window
+    Window win = validate_and_configure_window(src0->clone().get(), src1->clone().get(),
+                                               (src2 != nullptr) ? src2->clone().get() : nullptr, dst->clone().get(),
+                                               lhs_info, rhs_info, gemm_info, num_elements_processed);
+    ICLKernel::configure_internal(win);
+
+    // If _reinterpret_input_as_3d = reinterpret_output_as_3d = true,
+    // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
+    // This means that the actual m used by the kernel is given by dst->dimension(1) and not by gemm_info.m
+    const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : dst->dimension(1);
+
+    // These variables are used only if gemm_info.has_pad_y == true
+    const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(1) : src0->dimension(1);
+    const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(2) : src0->dimension(2);
+
+    // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
+    // NOTE: This might have implications on heuristics and performance
+    const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
+
+    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
+    const unsigned int partial_store_m0 = internal_m % internal_m0;
+    const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
+    _m                                  = internal_m;
+    _n                                  = gemm_info.n;
+    _k                                  = gemm_info.k;
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
+    build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)),
+                             "-DALPHA=" + float_to_string_with_full_precision(alpha));
+    build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
+    build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
+    build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
+    build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
+    build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
+    build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+    build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT");
+    build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1)));
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+    build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
+    build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
+    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
+    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
+    if (_has_pad_y)
+    {
+        build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
+        build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
+        build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+                                 "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
+        build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+                                 "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
+    }
+
+    build_opts.add_option_if(gemm_info.activation_info.enabled(),
+                             "-DACTIVATION_TYPE=" +
+                                 lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+    build_opts.add_option_if(gemm_info.activation_info.enabled(),
+                             "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+    build_opts.add_option_if(gemm_info.activation_info.enabled(),
+                             "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
+
+    std::string kernel_name("gemm_mm_reshaped_only_rhs_");
+    kernel_name += rhs_info.transpose ? "t" : "nt";
+    kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += (_has_pad_y ? "" : "no_pad_y_");
+    _config_id += (_add_bias ? "add_bias_" : "");
+    _config_id += (gemm_info.broadcast_bias ? "broadcast_bias_" : "");
+    _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
+    _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
+    _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : "");
+    _config_id += lower_string(string_from_data_type(src0->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(gemm_info.k);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.m0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.n0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.k0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.h0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.interleave);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo       *src0,
+                                                           const ITensorInfo       *src1,
+                                                           const ITensorInfo       *src2,
+                                                           const ITensorInfo       *dst,
+                                                           float                    alpha,
+                                                           float                    beta,
+                                                           const GEMMLHSMatrixInfo &lhs_info,
+                                                           const GEMMRHSMatrixInfo &rhs_info,
+                                                           const GEMMKernelInfo    &gemm_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
+    return Status{};
+}
+
+void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack      &tensors,
+                                                       const Window     &window,
+                                                       cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const auto src2 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+    ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr);
+
+    if (src1->info()->num_dimensions() < 3)
+    {
+        // The stride_z for matrix B must be zero if we do not slice
+        ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
+    }
+
+    const size_t lhs_idx_batch_size = _reinterpret_input_as_3d && !_has_pad_y ? 3u : 2u;
+    const size_t rhs_idx_batch_size = 2u;
+    const size_t bia_idx_batch_size = 2u;
+    const size_t out_idx_batch_size = _reinterpret_output_as_3d && !_has_pad_y ? 3u : 2u;
+
+    Window slice          = window.first_slice_window_3D();
+    Window slice_matrix_b = slice;
+
+    slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
+    slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+    // Get cross plane pads
+    const unsigned int total_cross_plane_pad_lhs = src0->info()->padding().top + src0->info()->padding().bottom;
+    const unsigned int total_cross_plane_pad_out = dst->info()->padding().top + dst->info()->padding().bottom;
+
+    // The execution should fail if we try to run with has_pad_y = false but we have padding in either the LHS or DST tensor
+    ARM_COMPUTE_ERROR_ON(!_has_pad_y && ((total_cross_plane_pad_lhs != 0) || (total_cross_plane_pad_out != 0)));
+
+    cl::Image2D src1_image2d;
+
+    if (_export_to_cl_image)
+    {
+        const TensorShape shape2d(src1->info()->dimension(0) / 4,
+                                  src1->info()->dimension(1) * src1->info()->dimension(2));
+        const size_t      image_row_pitch = src1->info()->strides_in_bytes()[1];
+
+        src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d,
+                                                  src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+    }
+
+    do
+    {
+        Window slice_b = slice;
+        // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
+        // This scenario can happen when the matrix multiplication is used to perform a convolution operation
+        if (!_slide_matrix_b)
+        {
+            slice_b = slice_matrix_b;
+        }
+
+        unsigned int idx = 0;
+
+        // LHS buffer
+        add_2D_tensor_argument(idx, src0, slice);
+
+        // RHS buffer or RHS OpenCL image (_export_to_cl_image == true)
+        if (_export_to_cl_image)
+        {
+            _kernel.setArg(idx++, src1_image2d);
+        }
+        else
+        {
+            add_2D_tensor_argument(idx, src1, slice_b);
+        }
+
+        // Bias buffer (_add_bias == true)
+        add_2D_tensor_argument_if(_add_bias, idx, src2, slice);
+
+        // dst buffer
+        add_2D_tensor_argument(idx, dst, slice);
+
+        // LHS stride_z
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[lhs_idx_batch_size]));
+
+        // RHS stride_z (not used if _export_to_cl_image == true)
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[rhs_idx_batch_size]));
+
+        // Bias stride_z (if _add_bias == true)
+        if (_add_bias)
+        {
+            _kernel.setArg<cl_uint>(idx++,
+                                    static_cast<unsigned int>(src2->info()->strides_in_bytes()[bia_idx_batch_size]));
+        }
+
+        // dst stride_z
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[out_idx_batch_size]));
+
+        // Cross-plan padding (if _reinterpret_input_as_3d = true)
+        if (_reinterpret_input_as_3d && _has_pad_y)
+        {
+            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_lhs));
+        }
+
+        // Cross-plan padding (if reinterpret_output_as_3d = true)
+        if (_reinterpret_output_as_3d && _has_pad_y)
+        {
+            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_out));
+        }
+
+        // Pass m, n and k at runtime as signed ints, to ensure results of any subractions they could be operand in, would still be signed.
+        _kernel.setArg<cl_int>(idx++, _m);
+        _kernel.setArg<cl_int>(idx++, _n);
+        _kernel.setArg<cl_int>(idx++, _k);
+
+        enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
+    } while (window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
new file mode 100644
index 0000000000..e8fd78d476
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2019-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
+#define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to multiply matrices when only the input matrix RHS (src1) has been reshaped
+ *
+ * @note The input matrix src1 must be reshaped through @ref ClGemmReshapeRhsMatrixKernel
+ */
+class ClGemmMatrixMultiplyReshapedOnlyRhsKernel : public ICLKernel
+{
+public:
+    ClGemmMatrixMultiplyReshapedOnlyRhsKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyReshapedOnlyRhsKernel);
+    /** Initialise the kernel's input and output.
+     *
+     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
+     *       Reading from the OpenCL image object can increase the performance. However, since the OpenCL image object is created importing the OpenCL buffer,
+     *       the following conditions are required:
+     *       -# rhs_info.n0 can only be 4, 8 and 16
+     *       -# rhs_info.k0 can only be 4, 8 and 16
+     *       -# Data type can only be F32
+     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+     *       -# The stride Y for the src1 should satisfy the OpenCL pitch alignment requirement
+     *       -# src1 width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+     *       -# src1 (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src0            Input tensor containing the LHS matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true).
+     *                             The number of dimensions for the LHS matrix must be less or equal than 4.
+     * @param[in]  src1            Input tensor containing the RHS reshaped matrix. Data type supported: same as @p src0. The number of dimensions for the RHS matrix must be less or equal than 3.
+     * @param[in]  src2            Input tensor containing the bias matrix. Data type supported: same as @p src0.
+     * @param[out] dst             Output tensor to store the result of matrix multiplication. Data type supported: same as @p src0
+     * @param[in]  alpha           Weight of the matrix product
+     * @param[in]  beta            Weight of the matrix bias
+     * @param[in]  lhs_info        LHS matrix information used to retrieve the number of rows to be processed by each thread. Only the following values are supported:
+     *                             lhs_info.m0: 1,2,3,4,5,6,7,8
+     * @param[in]  rhs_info        RHS matrix information used for reshaping the src1 tensor.  Only the following values are supported:
+     *                             rhs_info.k0: 2,3,4,8,16
+     *                             rhs_info.n0: 2,3,4,8,16
+     *                             rhs_info.transpose: true,false
+     * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
+     */
+    void configure(const ClCompileContext  &compile_context,
+                   const ITensorInfo       *src0,
+                   const ITensorInfo       *src1,
+                   const ITensorInfo       *src2,
+                   ITensorInfo             *dst,
+                   float                    alpha,
+                   float                    beta,
+                   const GEMMLHSMatrixInfo &lhs_info,
+                   const GEMMRHSMatrixInfo &rhs_info,
+                   const GEMMKernelInfo    &gemm_info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo       *src0,
+                           const ITensorInfo       *src1,
+                           const ITensorInfo       *src2,
+                           const ITensorInfo       *dst,
+                           float                    alpha,
+                           float                    beta,
+                           const GEMMLHSMatrixInfo &lhs_info,
+                           const GEMMRHSMatrixInfo &rhs_info,
+                           const GEMMKernelInfo    &gemm_info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    bool       _slide_matrix_b{true};
+    bool       _reinterpret_input_as_3d{false};
+    bool       _reinterpret_output_as_3d{false};
+    bool       _use_dummy_work_items{false};
+    bool       _add_bias{false};
+    bool       _export_to_cl_image{false};
+    bool       _has_pad_y{false};
+    signed int _m{1};
+    signed int _n{1};
+    signed int _k{1};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
new file mode 100644
index 0000000000..4ca4b83f9c
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLUtils.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/helpers/float_ops.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+using ElementsProcessed = Steps;
+
+// Block size dimensions for the MMUL extension
+constexpr int mmul_m0 = 4;
+constexpr int mmul_n0 = 4;
+constexpr int mmul_k0 = 4;
+
+Status validate_arguments(const ITensorInfo       *src0,
+                          const ITensorInfo       *src1,
+                          const ITensorInfo       *src2,
+                          const ITensorInfo       *dst,
+                          float                    alpha,
+                          float                    beta,
+                          const GEMMLHSMatrixInfo &lhs_info,
+                          const GEMMRHSMatrixInfo &rhs_info,
+                          const GEMMKernelInfo    &gemm_info)
+{
+    ARM_COMPUTE_UNUSED(alpha);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()),
+                                    "The extension cl_arm_matrix_multiply is not supported on the target platform");
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+                                    "The number of dimensions for the LHS matrix must be <= 4");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+                                    "The number of dimensions for the RHS matrix must be <= 3");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_info.m0 < 1, "Only values greater than 0 are supported for m0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.n0 != 1 && rhs_info.n0 != 2 && rhs_info.n0 != 3 && rhs_info.n0 != 4 &&
+                                        rhs_info.n0 != 8 && rhs_info.n0 != 16,
+                                    "Only 1,2,3,4,8, and 16 are supported for n0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.k0 != 1 || lhs_info.k0 != 1), "Only 1 is supported for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.h0 != 4), "Only 4 is supported for h0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.interleave != true,
+                                    "Only true is supported for interleave with mmul extension enabled");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.transpose != false,
+                                    "Only false is supported for transpose with mmul extension enabled");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
+    ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info));
+
+    const unsigned int m = gemm_info.m;
+    const unsigned int n = gemm_info.n;
+    const unsigned int k = gemm_info.k;
+
+    ARM_COMPUTE_UNUSED(m);
+    ARM_COMPUTE_UNUSED(n);
+    ARM_COMPUTE_UNUSED(k);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k);
+
+    // Validate the reinterpreted-as-3D-case
+    if (gemm_info.reinterpret_input_as_3d)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m);
+    }
+    else
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != m);
+    }
+
+    // Validate the gemm-batched case
+    if (src1->num_dimensions() > 2)
+    {
+        if (gemm_info.depth_output_gemm3d != 0)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(3) != src1->dimension(2));
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(2) != src1->dimension(2));
+        }
+    }
+
+    if (src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
+    {
+        const unsigned int src2_dim0 = src2->dimension(0);
+        const unsigned int src2_dim1 = src2->dimension(1);
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1);
+        if (gemm_info.broadcast_bias)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n),
+                                            "Incorrect dimension of bias matrix which is to be broadcasted");
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix");
+        }
+    }
+
+    TensorShape tensor_shape1{src1->tensor_shape()};
+    tensor_shape1.set(0, n);
+    tensor_shape1.set(1, k);
+
+    const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
+    const TensorInfo tensor_info_reshaped1 =
+        src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
+
+    if (dst->total_size() != 0)
+    {
+        const TensorInfo tensor_info_dst =
+            dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo             *src0,
+                                                        ITensorInfo             *src1,
+                                                        ITensorInfo             *src2,
+                                                        ITensorInfo             *dst,
+                                                        const GEMMLHSMatrixInfo &lhs_info,
+                                                        const GEMMRHSMatrixInfo &rhs_info,
+                                                        const GEMMKernelInfo    &gemm_info)
+{
+    ARM_COMPUTE_UNUSED(src0, src1, src2);
+    bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
+
+    // dst tensor auto initialization if not yet initialized
+    auto_init_if_empty(
+        *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
+
+    TensorInfo tmp_info(*dst);
+
+    if (reinterpret_output_as_3d)
+    {
+        // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+        // the window needs to be constructed on the 2D collapsed version of the tensor
+        TensorShape tmp_shape(dst->tensor_shape());
+        tmp_shape.collapse(2U, 1U);
+        tmp_info.set_tensor_shape(tmp_shape);
+    }
+
+    Window win = calculate_max_window(tmp_info, Steps(1, 1));
+
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
+    Window             collapsed             = win.collapse(win, dimension_to_collapse);
+
+    // Reconfigure window size, one arm_matrix_multiply kernel needs 16 threads to finish.
+    Window::Dimension x_dimension = collapsed.x();
+    Window::Dimension y_dimension = collapsed.y();
+
+    // Make M and N multiple of M0 and N0 respectively
+    const unsigned int ceil_to_multiple_n_n0 = ceil_to_multiple(x_dimension.end(), rhs_info.n0);
+    const unsigned int ceil_to_multiple_m_m0 = ceil_to_multiple(y_dimension.end(), lhs_info.m0);
+
+    // Divide M and N by M0 and N0 respectively
+    const unsigned int n_div_n0 = ceil_to_multiple_n_n0 / rhs_info.n0;
+    const unsigned int m_div_m0 = ceil_to_multiple_m_m0 / lhs_info.m0;
+
+    // Make n_div_n0 and m_div_m0 multiple of mmul_n0 and mmul_k0 respectively
+    const unsigned int ceil_to_multiple_n_div_n0_mmul_n0 = ceil_to_multiple(n_div_n0, mmul_n0);
+    const unsigned int ceil_to_multiple_m_div_m0_mmul_k0 = ceil_to_multiple(m_div_m0, mmul_k0);
+
+    // Ensure x_dimension is multiple of MMUL block size (mmul_n0 * mmul_k0)
+    x_dimension.set_end(ceil_to_multiple_n_div_n0_mmul_n0 * mmul_k0);
+    y_dimension.set_end(ceil_to_multiple_m_div_m0_mmul_k0 / mmul_k0);
+
+    collapsed.set(Window::DimX, x_dimension);
+    collapsed.set(Window::DimY, y_dimension);
+
+    return std::make_pair(Status{}, collapsed);
+}
+} // namespace
+
+ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel()
+{
+    _type = CLKernelType::GEMM;
+}
+
+void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext  &compile_context,
+                                                              ITensorInfo             *src0,
+                                                              ITensorInfo             *src1,
+                                                              ITensorInfo             *src2,
+                                                              ITensorInfo             *dst,
+                                                              float                    alpha,
+                                                              float                    beta,
+                                                              const GEMMLHSMatrixInfo &lhs_info,
+                                                              const GEMMRHSMatrixInfo &rhs_info,
+                                                              const GEMMKernelInfo    &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+
+    // dst tensor auto initialization if not yet initialized
+    auto_init_if_empty(
+        *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
+
+    auto padding_info   = get_padding_info({src0, src1, src2, dst});
+    _add_bias           = src2 != nullptr;
+    _export_to_cl_image = rhs_info.export_to_cl_image;
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(src0, src1, src2, dst, lhs_info, rhs_info, gemm_info);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+
+    IClKernel::configure_internal(win_config.second);
+
+    _m = gemm_info.m;
+    _n = gemm_info.n;
+    _k = gemm_info.k;
+
+    const unsigned int m0_leftover = _m % lhs_info.m0;
+    const unsigned int n0_leftover = _n % rhs_info.n0;
+
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
+    build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)),
+                             "-DALPHA=" + float_to_string_with_full_precision(alpha));
+    build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
+    build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
+    build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
+    build_opts.add_option_if(src0->data_type() == DataType::F16, "-DHALF_PRECISION");
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+    build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
+    build_opts.add_option("-DM0_LEFTOVER=" + support::cpp11::to_string(m0_leftover));
+    build_opts.add_option("-DN0_LEFTOVER=" + support::cpp11::to_string(n0_leftover));
+    build_opts.add_option("-DMMUL_M0=" + support::cpp11::to_string(mmul_m0));
+    build_opts.add_option("-DMMUL_N0=" + support::cpp11::to_string(mmul_n0));
+    build_opts.add_option("-DMMUL_K0=" + support::cpp11::to_string(mmul_k0));
+    build_opts.add_option("-DACTIVATION_TYPE=" +
+                          lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+    build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+    build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
+
+    build_opts.add_option_if(gemm_info.reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
+    build_opts.add_option_if(gemm_info.depth_output_gemm3d != 0, "-DREINTERPRET_OUTPUT_AS_3D");
+    build_opts.add_option_if(src1->num_dimensions() > 2, "-DBATCHED_RHS");
+
+    std::string kernel_name("gemm_mm_reshaped_only_rhs_nt_mmul");
+    kernel_name += _export_to_cl_image ? "_texture" : "";
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += (_add_bias ? "add_bias_" : "");
+    _config_id += (gemm_info.broadcast_bias ? "broadcast_bias_" : "");
+    _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : "");
+    _config_id += lower_string(string_from_data_type(src0->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(_m);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(_n);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(_k);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.m0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(rhs_info.n0);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo       *src0,
+                                                               const ITensorInfo       *src1,
+                                                               const ITensorInfo       *src2,
+                                                               const ITensorInfo       *dst,
+                                                               float                    alpha,
+                                                               float                    beta,
+                                                               const GEMMLHSMatrixInfo &lhs_info,
+                                                               const GEMMRHSMatrixInfo &rhs_info,
+                                                               const GEMMKernelInfo    &gemm_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(),
+                                                              src2 != nullptr ? src2->clone().get() : nullptr,
+                                                              dst->clone().get(), lhs_info, rhs_info, gemm_info)
+                                    .first);
+
+    return Status{};
+}
+
+void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack      &tensors,
+                                                           const Window     &window,
+                                                           cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const auto src2 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
+    ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr);
+
+    if (src1->info()->num_dimensions() < 3)
+    {
+        // The stride_z for matrix B must be zero if we do not slice
+        ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
+    }
+
+    cl::Image2D src1_image2d;
+
+    if (_export_to_cl_image)
+    {
+        const TensorShape shape2d(src1->info()->dimension(0) / 4,
+                                  src1->info()->dimension(1) * src1->info()->dimension(2));
+        const size_t      image_row_pitch = src1->info()->strides_in_bytes()[1];
+
+        src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d,
+                                                  src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+    }
+
+    Window slice = window.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+
+        add_3d_tensor_nhw_argument(idx, src0);
+        if (_export_to_cl_image)
+        {
+            _kernel.setArg(idx++, src1_image2d);
+        }
+        add_3d_tensor_nhw_argument(idx, src1);
+
+        // Bias buffer (_add_bias == true)
+        if (_add_bias)
+        {
+            add_3d_tensor_nhw_argument(idx, src2);
+        }
+        // dst buffer
+        add_3d_tensor_nhw_argument(idx, dst);
+
+        // Pass m, n and k at runtime as signed ints, to ensure results of any subtractions they could be operand in, would still be signed.
+        _kernel.setArg<cl_int>(idx++, _m);
+        _kernel.setArg<cl_int>(idx++, _n);
+        _kernel.setArg<cl_int>(idx++, _k);
+
+        // LWS_x should be multiple of 16 at least. (32, 2) has been chosen to have more work-items on a single core
+        // LWS also enforces the order of execution of the workitems which improves cache utilization
+        enqueue(queue, *this, slice, cl::NDRange(32, 2), false);
+    } while (window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h
new file mode 100644
index 0000000000..86d3012f6e
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMUL_KERNEL_H
+#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMUL_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to multiply matrices using MMUL when only the input matrix RHS (src1) has been reshaped */
+class ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel : public IClKernel
+{
+public:
+    ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel);
+    /** Initialize the kernel's input and dst.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src0            Input tensor for the LHS matrix. Data type supported: F16/F32.
+     * @param[in]  src1            Input tensor containing the RHS reshaped matrix. Data type supported: same as @p src0.
+     * @param[in]  src2            Input tensor containing the bias matrix. Data type supported: same as @p src0.
+     * @param[out] dst             dst tensor info. Data type supported: same as @p src0
+     * @param[in]  alpha           Weight of the matrix product
+     * @param[in]  beta            Weight of the matrix bias
+     * @param[in]  lhs_info        LHS matrix information used to retrieve the number of rows and accumulations to be processed by each thread. Only the following values are supported:
+     *                             lhs_info.m0 > 0
+     *                             lhs_info.k0: 1
+     * @param[in]  rhs_info        RHS matrix information used to retrieve the number of columns and accumulations to be processed by each thread. Only the following values are supported:
+     *                             rhs_info.n0: 1,2,3,4,8,16
+     *                             rhs_info.k0: same of lhs_info.k0
+     *                             rhs_info.transpose: false
+     * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
+     */
+    void configure(const ClCompileContext  &compile_context,
+                   ITensorInfo             *src0,
+                   ITensorInfo             *src1,
+                   ITensorInfo             *src2,
+                   ITensorInfo             *dst,
+                   float                    alpha,
+                   float                    beta,
+                   const GEMMLHSMatrixInfo &lhs_info,
+                   const GEMMRHSMatrixInfo &rhs_info,
+                   const GEMMKernelInfo    &gemm_info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo       *src0,
+                           const ITensorInfo       *src1,
+                           const ITensorInfo       *src2,
+                           const ITensorInfo       *dst,
+                           float                    alpha,
+                           float                    beta,
+                           const GEMMLHSMatrixInfo &lhs_info,
+                           const GEMMRHSMatrixInfo &rhs_info,
+                           const GEMMKernelInfo    &gemm_info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    bool       _add_bias{false};
+    bool       _export_to_cl_image{false};
+    signed int _m{1};
+    signed int _n{1};
+    signed int _k{1};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMUL_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp
new file mode 100644
index 0000000000..eea2a169a3
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo       *src,
+                          const ITensorInfo       *dst,
+                          const GEMMLHSMatrixInfo &lhs_info,
+                          bool                     reinterpret_input_as_3d)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.v0 == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3),
+                                    "Only 2,3,4,8,16 are supported for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
+    ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
+    ARM_COMPUTE_RETURN_ERROR_ON((lhs_info.m0 > 4 && lhs_info.m0 < 8) && lhs_info.transpose);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+
+    if (dst->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+            dst->tensor_shape(),
+            misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+    }
+
+    return Status{};
+}
+
+Window
+configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
+{
+    const unsigned int num_elems_processed_per_iteration_x = lhs_info.k0;
+    const unsigned int num_elems_processed_per_iteration_y = lhs_info.m0;
+
+    TensorInfo tmp_info(*src);
+
+    if (reinterpret_input_as_3d)
+    {
+        // Since the src tensor has to be reinterpreted as 3D and the execute window is based on a 2D interleave,
+        // the window needs to be constructed on the 2D collapsed version of the tensor
+        TensorShape tmp_shape(src->tensor_shape());
+        tmp_shape.collapse(2U, 1U);
+        tmp_info.set_tensor_shape(tmp_shape);
+    }
+
+    // dst auto inizialitation if not yet initialized
+    auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(
+                                 *src, lhs_info, reinterpret_input_as_3d)));
+
+    // Configure window
+    Window win =
+        calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    Window collapsed = win.collapse(win, Window::DimZ);
+
+    return collapsed;
+}
+} // namespace
+
+ClGemmReshapeLhsMatrixKernel::ClGemmReshapeLhsMatrixKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClGemmReshapeLhsMatrixKernel::configure(const CLCompileContext  &compile_context,
+                                             ITensorInfo             *src,
+                                             ITensorInfo             *dst,
+                                             const GEMMLHSMatrixInfo &lhs_info,
+                                             bool                     reinterpret_input_as_3d)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    // Perform validate step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, lhs_info, reinterpret_input_as_3d));
+
+    auto padding_info = get_padding_info({src});
+
+    const unsigned int src_w      = src->dimension(0);
+    const unsigned int m          = reinterpret_input_as_3d ? src->dimension(1) * src->dimension(2) : src->dimension(1);
+    const unsigned int partial_m0 = m % lhs_info.m0;
+    const unsigned int partial_k0 = src_w % lhs_info.k0;
+
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+    build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
+    build_opts.add_option_if(lhs_info.interleave, "-DINTERLEAVE");
+    build_opts.add_option_if_else(lhs_info.transpose, "-DRESHAPE_LHS_T", "-DRESHAPE_LHS_NT");
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()));
+    build_opts.add_option("-DPARTIAL_M0=" + support::cpp11::to_string(partial_m0));
+    build_opts.add_option("-DPARTIAL_K0=" + support::cpp11::to_string(partial_k0));
+
+    std::string kernel_name("gemm_reshape_lhs_matrix_");
+    kernel_name += lhs_info.transpose ? "t" : "nt";
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel window
+    auto win_config = configure_window(src, dst, lhs_info, reinterpret_input_as_3d);
+    ICLKernel::configure_internal(win_config);
+
+    unsigned int idx = 2 * num_arguments_per_3d_tensor_nhw();
+    _kernel.setArg<cl_int>(idx++, m);
+    _kernel.setArg<cl_int>(idx++, lhs_info.v0);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "gemm_reshape_lhs_matrix_";
+    _config_id += (reinterpret_input_as_3d ? "3d_" : "");
+    _config_id += lower_string(string_from_data_type(src->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.m0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.k0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.v0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.interleave);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(lhs_info.transpose);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClGemmReshapeLhsMatrixKernel::validate(const ITensorInfo       *src,
+                                              const ITensorInfo       *dst,
+                                              const GEMMLHSMatrixInfo &lhs_info,
+                                              bool                     reinterpret_input_as_3d)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, lhs_info, reinterpret_input_as_3d));
+    return Status{};
+}
+
+void ClGemmReshapeLhsMatrixKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    Window slice = window.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3d_tensor_nhw_argument(idx, src);
+        add_3d_tensor_nhw_argument(idx, dst);
+        enqueue(queue, *this, slice, lws_hint());
+    } while (window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h
new file mode 100644
index 0000000000..8e84e8ad8e
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H
+#define ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to reshape the LHS matrix when performing the matrix multiplication.
+ *  In particular, this function splits the src matrix in blocks of size M0xK0 (defined through GEMMLHSInfo) and
+ *  stores each one in the dst matrix unrolling the values
+ */
+class ClGemmReshapeLhsMatrixKernel : public ICLKernel
+{
+public:
+    ClGemmReshapeLhsMatrixKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmReshapeLhsMatrixKernel);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context       The compile context to be used.
+     * @param[in]  src                   Input tensor. Data types supported: All
+     * @param[out] dst                   Output tensor. Data type supported: same as @p src
+     * @param[in]  lhs_info              LHS matrix information to be used for reshaping. This object contains all the necessary
+     *                                   information to reshape the src tensor. Only the following values are supported:
+     *                                   lhs_info.m0: 2,3,4,5,6,7,8
+     *                                   lhs_info.k0: 2,3,4,8,16
+     *                                   lhs_info.v0: greater than 0
+     *                                   lhs_info.transpose: true, false
+     *                                   lhs_info.interleave: true, false
+     * @param[in]  reinterpret_src_as_3d (Optional) True if the src has to be reinterpreted as 3D tensor
+     */
+    void configure(const ClCompileContext  &compile_context,
+                   ITensorInfo             *src,
+                   ITensorInfo             *dst,
+                   const GEMMLHSMatrixInfo &lhs_info,
+                   bool                     reinterpret_src_as_3d = false);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmReshapeLhsMatrixKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo       *src,
+                           const ITensorInfo       *dst,
+                           const GEMMLHSMatrixInfo &lhs_info,
+                           bool                     reinterpret_src_as_3d);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp
new file mode 100644
index 0000000000..b9ce3873c7
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.h0 == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3),
+                                    "Only 2,3,4,8,16 are supported for n0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && (rhs_info.k0 != 1) && (rhs_info.k0 != 3)),
+                                    "Only 1,2,3,4,8,16 are supported for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16);
+    ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16);
+    ARM_COMPUTE_RETURN_ERROR_ON((rhs_info.k0 == 1) && (rhs_info.transpose));
+
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+
+    if (rhs_info.export_to_cl_image)
+    {
+        const TensorInfo tensor_reshaped_info(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info), 1,
+                                              src->data_type());
+        ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info));
+    }
+
+    if (dst->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+            dst->tensor_shape(), misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info)
+{
+    const unsigned int num_elems_processed_per_iteration_x = rhs_info.n0;
+    const unsigned int num_elems_processed_per_iteration_y = rhs_info.k0;
+    bool               window_changed                      = false;
+
+    // dst auto initialization if not yet initialized
+    auto_init_if_empty(
+        *dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info)));
+
+    // Configure window
+    Window win =
+        calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+    AccessWindowRectangle src_access(src, 0, 0, num_elems_processed_per_iteration_x,
+                                     num_elems_processed_per_iteration_y);
+
+    window_changed = update_window_and_padding(win, src_access);
+
+    if (rhs_info.export_to_cl_image)
+    {
+        gemm::update_padding_for_cl_image(dst);
+    }
+
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    Window collapsed = win.collapse(win, Window::DimZ);
+
+    Status err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, collapsed);
+}
+} // namespace
+
+ClGemmReshapeRhsMatrixKernel::ClGemmReshapeRhsMatrixKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClGemmReshapeRhsMatrixKernel::configure(const CLCompileContext  &compile_context,
+                                             ITensorInfo             *src,
+                                             ITensorInfo             *dst,
+                                             const GEMMRHSMatrixInfo &rhs_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    // Perform validate step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, rhs_info));
+
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+    build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
+    build_opts.add_option_if(rhs_info.interleave, "-DINTERLEAVE");
+    build_opts.add_option_if(rhs_info.transpose, "-DRESHAPE_RHS_T");
+    build_opts.add_option_if(!rhs_info.transpose, "-DRESHAPE_RHS_NT");
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()));
+
+    std::string kernel_name("gemm_reshape_rhs_matrix_");
+    kernel_name += rhs_info.transpose ? "t" : "nt";
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(src, dst, rhs_info);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
+    unsigned int idx = 2 * num_arguments_per_3d_tensor_nhw();
+    _kernel.setArg<cl_int>(idx++, rhs_info.h0);
+}
+
+Status ClGemmReshapeRhsMatrixKernel::validate(const ITensorInfo       *src,
+                                              const ITensorInfo       *dst,
+                                              const GEMMRHSMatrixInfo &rhs_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, rhs_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), rhs_info).first);
+
+    return Status{};
+}
+
+void ClGemmReshapeRhsMatrixKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    Window slice = window.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3d_tensor_nhw_argument(idx, src);
+        add_3d_tensor_nhw_argument(idx, dst);
+        enqueue(queue, *this, slice, lws_hint());
+    } while (window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h
new file mode 100644
index 0000000000..7203d574fb
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H
+#define ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to reshape the RHS matrix when performing the matrix multiplication
+ *  In particular, this kernel splits the src matrix in blocks of size K0xN0 and stores each one in
+ *  the dst matrix unrolling the values */
+class ClGemmReshapeRhsMatrixKernel : public ICLKernel
+{
+public:
+    ClGemmReshapeRhsMatrixKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmReshapeRhsMatrixKernel);
+    /** Initialise the kernel's input and output.
+     *
+     * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will guarantee the OpenCL pitch alignment for the output tensor,
+     *       required to create a OpenCL image object from buffer in @ref ClGemmMatrixMultiplyReshapedKernel and in @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel
+     *       Since the OpenCL image object is created importing the OpenCL buffer, the following conditions are required:
+     *       -# rhs_info.n0 can only be 4, 8 and 16
+     *       -# rhs_info.k0 can only be 4, 8 and 16
+     *       -# Data type can only be F32, F16
+     *       -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+     *       -# output width should be less or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+     *       -# output (height * depth) should be less or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+     *       -# The output tensor should be only consumed by @ref ClGemmMatrixMultiplyReshapedKernel or @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Input tensor. Data types supported: All
+     * @param[out] dst             Output tensor. Data type supported: same as @p src
+     * @param[in]  rhs_info        RHS matrix information to be used for reshaping. This object contains all the necessary
+     *                             information to reshape the src tensor. Only the following values are supported:
+     *                             rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
+     *                             rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false), (only 4, 8 and 16 if rhs_info.export_to_cl_image == true)
+     *                             rhs_info.h0: greater than 0
+     *                             rhs_info.transpose: true, false
+     *                             rhs_info.interleave: true, false
+     */
+    void configure(const ClCompileContext  &compile_context,
+                   ITensorInfo             *src,
+                   ITensorInfo             *dst,
+                   const GEMMRHSMatrixInfo &rhs_info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmReshapeRhsMatrixKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp b/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp
new file mode 100644
index 0000000000..2e1cefc6e7
--- /dev/null
+++ b/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2019-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClHeightConcatenateKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) + height_offset > dst->dimension(Window::DimY));
+
+    ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != dst->dimension(0));
+    for (size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i));
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() > 4);
+
+    return Status{};
+}
+} // namespace
+
+ClHeightConcatenateKernel::ClHeightConcatenateKernel() : _height_offset(0)
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+Status ClHeightConcatenateKernel::validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, height_offset, dst));
+    return Status{};
+}
+
+void ClHeightConcatenateKernel::configure(const CLCompileContext &compile_context,
+                                          ITensorInfo            *src,
+                                          unsigned int            height_offset,
+                                          ITensorInfo            *dst)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, height_offset, dst));
+
+    auto padding_info = get_padding_info({src, dst});
+
+    _height_offset = height_offset;
+
+    // Add build options
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0));
+
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DHEIGHT_OFFSET=" + support::cpp11::to_string(_height_offset));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+
+    if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
+    {
+        const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
+
+        build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
+        build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
+        build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale));
+        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
+    }
+    _depth = src->dimension(2);
+
+    std::string kernel_name = "concatenate_height";
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+    // Configure kernel window
+
+    // The window needs to be based on src as we copy all the heights of src
+    Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration));
+    ICLKernel::configure_internal(win.collapse(win, Window::DimZ));
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+void ClHeightConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, src, window);
+    add_4D_tensor_argument(idx, dst, window);
+    _kernel.setArg<cl_uint>(idx++, _depth);
+    enqueue(queue, *this, window, lws_hint());
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClHeightConcatenateKernel.h b/src/gpu/cl/kernels/ClHeightConcatenateKernel.h
new file mode 100644
index 0000000000..5a391a1212
--- /dev/null
+++ b/src/gpu/cl/kernels/ClHeightConcatenateKernel.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_HEIGHT_CONCATENATE_KERNEL_H
+#define ARM_COMPUTE_CL_HEIGHT_CONCATENATE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the height concatenate kernel.
+ *  The source tensor will be concatenated into the destination tensor.
+ */
+class ClHeightConcatenateKernel : public IClKernel
+{
+public:
+    ClHeightConcatenateKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClHeightConcatenateKernel);
+    /** Initialise the kernel's source and destination
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data types supported: All.
+     * @param[in]  height_offset   The starting offset on the Y axis for the dst tensor.
+     * @param[out] dst             Destination tensor info. Data types supported: same as @p src.
+     *
+     */
+    void
+    configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClHeightConcatenateKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+
+private:
+    unsigned int _height_offset;
+    int32_t      _depth{0};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_HEIGHT_CONCATENATE_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClIm2ColKernel.cpp b/src/gpu/cl/kernels/ClIm2ColKernel.cpp
new file mode 100644
index 0000000000..ef7a52828f
--- /dev/null
+++ b/src/gpu/cl/kernels/ClIm2ColKernel.cpp
@@ -0,0 +1,488 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClIm2ColKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+#include <cmath>
+#include <tuple>
+#include <utility>
+
+namespace arm_compute
+{
+using namespace misc::shape_calculator;
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+struct Im2ColConfiguration
+{
+    std::string           kernel_name{};
+    std::set<std::string> build_options{};
+    unsigned int          num_elems_processed_per_iteration{};
+    bool                  is_padding_required_nchw{};
+};
+
+Status validate_arguments(const ITensorInfo   *src,
+                          const ITensorInfo   *dst,
+                          const Size2D        &kernel_dims,
+                          const PadStrideInfo &conv_info,
+                          bool                 has_bias,
+                          const Size2D        &dilation,
+                          unsigned int         num_groups)
+{
+    const unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) && has_bias);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
+    ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
+    ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
+    ARM_COMPUTE_RETURN_ERROR_ON(num_groups == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::NHWC && num_groups > 1);
+    ARM_COMPUTE_RETURN_ERROR_ON((src->dimension(channel_idx) % num_groups) != 0);
+
+    // Since there's no implicit padding added, check the total input spatial dimensions (with conv paddings) are big enough for the kernel dimensions
+    const unsigned int width_idx    = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
+    const unsigned int height_idx   = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
+    const unsigned     total_width  = src->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right();
+    const unsigned     total_height = src->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom();
+    ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height));
+
+    if (dst->total_size() > 0)
+    {
+        const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(
+            compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo         *src,
+                                                        ITensorInfo         *dst,
+                                                        const Size2D        &kernel_dims,
+                                                        const PadStrideInfo &conv_info,
+                                                        bool                 has_bias,
+                                                        const Size2D        &dilation,
+                                                        unsigned int         num_elems_processed_per_iteration,
+                                                        bool                 is_padding_required_nchw,
+                                                        unsigned int         num_groups)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    // Output tensor auto initialization if not yet initialized
+    TensorShape expected_output_shape =
+        compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups);
+
+    auto_init_if_empty(*dst, src->clone()->set_tensor_shape(expected_output_shape));
+
+    const DataLayout   data_layout  = src->data_layout();
+    const unsigned int width_idx    = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const unsigned int input_width  = src->dimension(width_idx);
+    const unsigned int input_height = src->dimension(height_idx);
+
+    // Configure the execute window based on the selected optimal OpenCL kernel
+    bool   window_changed = false;
+    Window win;
+
+    if (data_layout == DataLayout::NHWC)
+    {
+        win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration));
+    }
+    else
+    {
+        if (is_padding_required_nchw)
+        {
+            const BorderSize border(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(),
+                                    conv_info.pad_left());
+            win = calculate_max_window(
+                *src, Steps(num_elems_processed_per_iteration * conv_info.stride().first, conv_info.stride().second));
+            AccessWindowStatic input_access(
+                src, -border.left, -border.top,
+                ceil_to_multiple(input_width + border.right, kernel_dims.width * num_elems_processed_per_iteration),
+                input_height + border.bottom);
+            window_changed = window_changed || update_window_and_padding(win, input_access);
+        }
+        else
+        {
+            // For the generic case, CLIm2ColKernel doesn't need padding (we do not read out-of-bounds elements) so
+            // update_window_and_padding() can be skipped
+            win = calculate_max_window(*src, Steps());
+        }
+    }
+
+    // set the Z dimension's step same size as the whole dimension so that one can't split across the Z dimension
+    win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - win[Window::DimZ].start());
+
+    Status err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+
+Im2ColConfiguration configure_opencl_kernel(const ITensorInfo   *src,
+                                            const Size2D        &kernel_dims,
+                                            const PadStrideInfo &conv_info,
+                                            bool                 has_bias,
+                                            const Size2D        &dilation,
+                                            unsigned int         num_groups)
+{
+    const DataLayout   data_layout   = src->data_layout();
+    const DataType     data_type     = src->data_type();
+    const unsigned int width_idx     = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx    = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const unsigned int channel_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+    const unsigned int input_width   = src->dimension(width_idx);
+    const unsigned int input_height  = src->dimension(height_idx);
+    const unsigned int input_channel = src->dimension(channel_idx);
+
+    const std::pair<unsigned int, unsigned int> convolved_dims =
+        scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation);
+
+    // Im2Col configuration
+    std::string                   kernel_name = "im2col_generic_";
+    CLBuildOptions                build_opts;
+    unsigned int                  num_elems_processed_per_iteration = 1;
+    bool                          is_padding_required_nchw          = false;
+    const UniformQuantizationInfo qinfo                             = src->quantization_info().uniform();
+
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src->element_size()));
+    build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
+    build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
+    build_opts.add_option("-DCONVOLVED_WIDTH=" + support::cpp11::to_string(convolved_dims.first));
+    build_opts.add_option("-DCONVOLVED_HEIGHT=" + support::cpp11::to_string(convolved_dims.second));
+    build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.stride().first));
+    build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second));
+    build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
+    build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
+    build_opts.add_option("-DPAD_RIGHT=" + support::cpp11::to_string(conv_info.pad_right()));
+    build_opts.add_option("-DPAD_BOTTOM=" + support::cpp11::to_string(conv_info.pad_bottom()));
+    build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width));
+    build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height));
+    build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_channel));
+    build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
+    build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
+    build_opts.add_option_if(num_groups > 1, "-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
+    build_opts.add_option_if_else(is_data_type_quantized(data_type),
+                                  "-DPAD_VALUE=" + support::cpp11::to_string(qinfo.offset), "-DPAD_VALUE=0");
+    build_opts.add_option_if(has_bias, "-DHAS_BIAS");
+
+    if (data_layout == DataLayout::NHWC)
+    {
+        num_elems_processed_per_iteration = std::min(2U, input_channel);
+        is_padding_required_nchw          = false;
+
+        // Only the 3x3 and 9x9 cases are optimized for NHWC
+        if (kernel_dims == Size2D(3U, 3U))
+        {
+            kernel_name = "im2col3x3_";
+            build_opts.add_option("-DIM2COL_3X3");
+        }
+        else if (kernel_dims == Size2D(9U, 9U))
+        {
+            kernel_name = "im2col9x9_";
+            build_opts.add_option("-DIM2COL_9X9");
+        }
+        else
+        {
+            build_opts.add_option("-DIM2COL_GENERIC");
+        }
+
+        // Get boundary vector (the first/last vector with potentially a partial vector size) size
+        // If input_channel is a multiple of num_elems_processed_per_iteration, the boundary vec size is the (full) vector size
+        // otherwise, the boundary vec size is the (partial) remainder vector size
+        const unsigned int vec_size          = num_elems_processed_per_iteration;
+        const unsigned int partial_vec_size  = input_channel % vec_size;
+        const unsigned int boundary_vec_size = vec_size - ((vec_size - partial_vec_size) % vec_size);
+        build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vec_size));
+        build_opts.add_option("-DBOUNDARY_VECTOR_SIZE=" + support::cpp11::to_string(boundary_vec_size));
+    }
+    else
+    {
+        if (dilation == Size2D(1U, 1U))
+        {
+            const bool squared_im2col = kernel_dims.width == kernel_dims.height;
+            if (squared_im2col)
+            {
+                // Check if we can run an optimized im2col for NCHW
+                switch (kernel_dims.width)
+                {
+                    case 1:
+                        // Optimized im2col1x1 if stride_x = 1 and conv_info.has_padding() = false
+                        if (conv_info.stride().first == 1 && !conv_info.has_padding())
+                        {
+                            kernel_name                       = "im2col1x1_stridex1_";
+                            num_elems_processed_per_iteration = 4;
+                            is_padding_required_nchw          = true;
+                        }
+                        break;
+                    case 3:
+                        kernel_name                       = "im2col3x3_";
+                        num_elems_processed_per_iteration = 1;
+                        is_padding_required_nchw          = true;
+                        break;
+                    case 5:
+                        kernel_name                       = "im2col5x5_";
+                        num_elems_processed_per_iteration = 1;
+                        is_padding_required_nchw          = true;
+                        break;
+                    case 11:
+                        // Optimized im2col11x11 if pad_x = pad_y = 0
+                        if (!conv_info.has_padding())
+                        {
+                            kernel_name                       = "im2col11x11_padx0_pady0_";
+                            num_elems_processed_per_iteration = 1;
+                            is_padding_required_nchw          = true;
+                        }
+                        break;
+                    default:
+                        kernel_name                       = "im2col_generic_";
+                        num_elems_processed_per_iteration = 1;
+                        is_padding_required_nchw          = false;
+                        break;
+                }
+            }
+            else if (kernel_dims.width > 1 && !conv_info.has_padding())
+            {
+                kernel_name                       = "im2col_generic_padx0_pady0_";
+                num_elems_processed_per_iteration = 1;
+                is_padding_required_nchw          = false;
+
+                // Optimized im2col is performed using one or more vector operations with the specified vector size
+                // and a remainder. For example, for 5x5 convolutions, im2col is performed using vectors of size 4
+                // and scalars; for 7x7 convolutions, using vectors of size 4 and vectors of size 3.
+                // Using the vector size of 4 is always safe since OpenCL supports vectors of size 2 and 3.
+                // Using the vector size of 8, however, may be faster.
+                // For 2x2 convolutions, use vectors of size 2. (For 3x3 convolutions, im2col_kernel3x3_padx0_pady0
+                // is used instead.)
+                const size_t vector_size           = std::min(static_cast<size_t>(4), kernel_dims.width);
+                const size_t width_mod_vector_size = kernel_dims.width % vector_size;
+                build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
+                build_opts.add_option("-DWIDTH_MOD_VECTOR_SIZE=" + support::cpp11::to_string(width_mod_vector_size));
+            }
+        }
+    }
+
+    // Append the data layout to the kernel_name
+    kernel_name += lower_string(string_from_data_layout(data_layout));
+
+    Im2ColConfiguration im2col_config;
+    im2col_config.kernel_name                       = kernel_name;
+    im2col_config.build_options                     = build_opts.options();
+    im2col_config.num_elems_processed_per_iteration = num_elems_processed_per_iteration;
+    im2col_config.is_padding_required_nchw          = is_padding_required_nchw;
+
+    return im2col_config;
+}
+} // namespace
+
+ClIm2ColKernel::ClIm2ColKernel()
+    : _data_layout(DataLayout::UNKNOWN),
+      _convolved_dims(),
+      _num_elems_processed_per_iteration(1),
+      _kernel_dims(),
+      _conv_info(),
+      _num_groups()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClIm2ColKernel::configure(const ClCompileContext &compile_context,
+                               ITensorInfo            *src,
+                               ITensorInfo            *dst,
+                               const Size2D           &kernel_dims,
+                               const PadStrideInfo    &conv_info,
+                               bool                    has_bias,
+                               const Size2D           &dilation,
+                               unsigned int            num_groups)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups));
+
+    auto padding_info = get_padding_info({src, dst});
+    _data_layout      = src->data_layout();
+
+    const unsigned int width_idx    = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx   = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    const unsigned int input_width  = src->dimension(width_idx);
+    const unsigned int input_height = src->dimension(height_idx);
+
+    // Select and configure the optimal OpenCL kernel to run.
+    // This function returns the OpenCL kernel's name, the arguments to pass at compile time, the number of elements processed per iteration
+    // and the padding requirement flag
+    Im2ColConfiguration im2col_config =
+        configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups);
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, im2col_config.kernel_name, im2col_config.build_options);
+
+    _convolved_dims =
+        scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation);
+    _num_elems_processed_per_iteration = im2col_config.num_elems_processed_per_iteration;
+    _kernel_dims                       = kernel_dims; // Only needed by the Tuner
+    _conv_info                         = conv_info;   // Only needed by the Tuner
+    _num_groups                        = num_groups;
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(src, dst, kernel_dims, conv_info, has_bias, dilation,
+                                                    im2col_config.num_elems_processed_per_iteration,
+                                                    im2col_config.is_padding_required_nchw, num_groups);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    IClKernel::configure_internal(win_config.second);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = im2col_config.kernel_name;
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_type(src->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(num_groups);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(1));
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_layout(_data_layout));
+
+    ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info));
+}
+
+Status ClIm2ColKernel::validate(const ITensorInfo   *src,
+                                const ITensorInfo   *dst,
+                                const Size2D        &kernel_dims,
+                                const PadStrideInfo &conv_info,
+                                bool                 has_bias,
+                                const Size2D        &dilation,
+                                unsigned int         num_groups)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups));
+    Im2ColConfiguration im2col_config =
+        configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), kernel_dims,
+                                                              conv_info, has_bias, dilation,
+                                                              im2col_config.num_elems_processed_per_iteration,
+                                                              im2col_config.is_padding_required_nchw, num_groups)
+                                    .first);
+    return Status{};
+}
+
+void ClIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IClKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(tensors.empty());
+
+    // Get initial windows
+    // Collapse in order to have (SRC_DEPTH * BATCH_SIZE) on the 3rd dimension
+    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    window_collapsed.set_dimension_step(Window::DimZ, 1);
+
+    auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    Window window_output;
+    window_output.use_tensor_dimensions(dst->info()->tensor_shape());
+
+    const Window first_slice_3d = window_collapsed.first_slice_window_3D();
+
+    Window slice     = first_slice_3d;
+    Window slice_in  = first_slice_3d;
+    Window slice_out = window_output.first_slice_window_2D();
+
+    if (_data_layout == DataLayout::NHWC)
+    {
+        const Window tmp_win     = window.collapse_if_possible(ICLKernel::window(), 3);
+        const int    num_batches = tmp_win[3].end();
+
+        slice.set(1, Window::Dimension(0, static_cast<int>(dst->info()->tensor_shape()[1]), 1));
+        slice.set(2, Window::Dimension(0, static_cast<int>(num_batches), 1));
+    }
+    else
+    {
+        slice.set(0,
+                  Window::Dimension(
+                      0, static_cast<int>(ceil_to_multiple(_convolved_dims.first, _num_elems_processed_per_iteration)),
+                      _num_elems_processed_per_iteration));
+        slice.set(1, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1));
+        // Note: In case of NCHW the 3rd dimension is already set collapsing the input window
+    }
+
+    // Setup input slice
+    // The dimensions of the input are increased within the OpenCL kernel
+    slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
+    slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+    slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    // Setup output slice
+    // The dimensions of the output are increased within the OpenCL kernel
+    slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+    slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+    unsigned int idx = num_arguments_per_3D_tensor() +
+                       (_num_groups == 1 ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor());
+    _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src->info()->strides_in_bytes()[3]));
+    _kernel.setArg<cl_uint>(idx++,
+                            static_cast<unsigned int>(dst->info()->strides_in_bytes()[((_num_groups == 1) ? 2 : 3)]));
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src, slice_in);
+        if (_num_groups == 1)
+        {
+            add_2D_tensor_argument(idx, dst, slice_out);
+        }
+        else
+        {
+            add_3D_tensor_argument(idx, dst, slice_out);
+        }
+        enqueue(queue, *this, slice, lws_hint());
+    } while (window_collapsed.slide_window_slice_3D(slice) && window_output.slide_window_slice_2D(slice_out) &&
+             window_collapsed.slide_window_slice_3D(slice_in));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClIm2ColKernel.h b/src/gpu/cl/kernels/ClIm2ColKernel.h
new file mode 100644
index 0000000000..c8cd5b328d
--- /dev/null
+++ b/src/gpu/cl/kernels/ClIm2ColKernel.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_IM2COL_KERNEL_H
+#define ARM_COMPUTE_CL_IM2COL_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Size2D.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the im2col reshape kernel.
+ *
+ * Rearranges image blocks into columns. It is used to strip out each convolution block to a single column.
+ * It is used to transform a convolution to a plain matrix multiplication.
+ *
+ * For example taking into account the image below and assuming 3x3 image blocks with stride of 1 we have:
+ * @f[
+ * \left( \begin{array}{cccc}
+ * a00 & a01 & a02 & a03 \\
+ * a10 & a11 & a12 & a13 \\
+ * a20 & a21 & a22 & a23 \\
+ * a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * =
+ * \left( \begin{array}{ccccccccc}
+ * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\
+ * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\
+ * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\
+ * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * @f]
+ */
+class ClIm2ColKernel : public IClKernel
+{
+public:
+    /** Default constructor */
+    ClIm2ColKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClIm2ColKernel);
+    /** Set the input and output of the kernel.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             The input tensor info to convert. 3 lower dimensions represent a single input [width, height, IFM],
+     *                             while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+     * @param[out] dst             The output tensor info. First 2 lower dimensions represent a transform of each 3D input,
+     *                             while every dimension above represents a batch. Data types supported: Same as @p input
+     * @param[in]  kernel_dims     The kernel dimensions (width and height).
+     * @param[in]  conv_info       Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  has_bias        In case biases are provided expands the matrix with 1.
+     * @param[in]  dilation        (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+     * @param[in]  num_groups      (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
+     */
+    void configure(const ClCompileContext &compile_context,
+                   ITensorInfo            *src,
+                   ITensorInfo            *dst,
+                   const Size2D           &kernel_dims,
+                   const PadStrideInfo    &conv_info,
+                   bool                    has_bias,
+                   const Size2D           &dilation   = Size2D(1U, 1U),
+                   unsigned int            num_groups = 1);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClIm2ColKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo   *input,
+                           const ITensorInfo   *output,
+                           const Size2D        &kernel_dims,
+                           const PadStrideInfo &conv_info,
+                           bool                 has_bias,
+                           const Size2D        &dilation   = Size2D(1U, 1U),
+                           unsigned int         num_groups = 1);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+public:
+    DataLayout                            _data_layout;
+    std::pair<unsigned int, unsigned int> _convolved_dims;
+    unsigned int                          _num_elems_processed_per_iteration;
+    Size2D                                _kernel_dims;
+    PadStrideInfo                         _conv_info;
+    unsigned int                          _num_groups;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_IM2COL_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp
new file mode 100644
index 0000000000..8c493d08c6
--- /dev/null
+++ b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo                 *src,
+                          const ITensorInfo                 *weights,
+                          const ITensorInfo                 *dst,
+                          const PadStrideInfo               &conv_info,
+                          const DirectConvComputeKernelInfo &desc)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != src->dimension(0),
+                                    "Weights feature map dimension should match the respective src's one");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8,
+                                    "M0 can only be greater than 0 and less than or equal to 8");
+
+    // Checks performed when dst is configured
+    if (dst->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+            dst->tensor_shape(),
+            misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), src->data_layout(),
+                                                                  weights->tensor_shape(), conv_info, desc));
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
+    }
+
+    return Status{};
+}
+} // namespace
+
+ClIndirectConv2dAddressPrecalculationKernel::ClIndirectConv2dAddressPrecalculationKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClIndirectConv2dAddressPrecalculationKernel::configure(const CLCompileContext            &compile_context,
+                                                            ITensorInfo                       *src,
+                                                            ITensorInfo                       *weights,
+                                                            ITensorInfo                       *dst,
+                                                            const PadStrideInfo               &conv_info,
+                                                            const DirectConvComputeKernelInfo &desc)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, dst, conv_info, desc));
+
+    constexpr unsigned int width_idx  = 1;
+    constexpr unsigned int height_idx = 2;
+
+    // Get dst shape
+    TensorShape output_shape = misc::shape_calculator::compute_indirect_buffer_shape(
+        src->tensor_shape(), src->data_layout(), weights->tensor_shape(), conv_info, desc);
+
+    TensorShape output_conv_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
+
+    // Output auto inizialitation if not yet initialized
+    auto_init_if_empty(*dst, output_shape, 1, DataType::S32);
+
+    // Configure kernel window
+    Window win;
+
+    // Create window and update padding
+    win = calculate_max_window(output_shape, Steps(1));
+
+    ICLKernel::configure_internal(win);
+
+    std::stringstream kernel_name;
+    CLBuildOptions    build_options;
+
+    kernel_name << "indirect_convolution_address_precalculation";
+
+    const unsigned int pad_left      = conv_info.pad_left();
+    const unsigned int pad_top       = conv_info.pad_top();
+    const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
+    const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
+    const auto         dst_data_type = dst->data_type();
+
+    build_options.add_option("-DSRC_CONV_WIDTH=" + support::cpp11::to_string(src->dimension(width_idx)));
+    build_options.add_option("-DSRC_CONV_HEIGHT=" + support::cpp11::to_string(src->dimension(height_idx)));
+    build_options.add_option("-DDST_CONV_WIDTH=" + support::cpp11::to_string(output_conv_shape[width_idx]));
+    build_options.add_option("-DDST_CONV_HEIGHT=" + support::cpp11::to_string(output_conv_shape[height_idx]));
+    build_options.add_option("-DDST_TENSOR_TYPE=BUFFER");
+    build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst_data_type));
+    build_options.add_option("-DWEI_CONV_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx)));
+    build_options.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x));
+    build_options.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_stride_y));
+    build_options.add_option("-DPAD_LEFT=" + support::cpp11::to_string(pad_left));
+    build_options.add_option("-DPAD_TOP=" + support::cpp11::to_string(pad_top));
+    build_options.add_option("-DM0=" + support::cpp11::to_string(desc.m0));
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_options.add_option("-D" + upper_string(kernel_name.str()));
+
+    _kernel = create_kernel(compile_context, kernel_name.str(), build_options.options());
+
+    // Since this kernel should be called only once, we do not need to set the config_id for tuning
+}
+
+Status ClIndirectConv2dAddressPrecalculationKernel::validate(const ITensorInfo                 *src,
+                                                             const ITensorInfo                 *weights,
+                                                             const ITensorInfo                 *dst,
+                                                             const PadStrideInfo               &conv_info,
+                                                             const DirectConvComputeKernelInfo &desc)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, dst, conv_info, desc));
+    return Status{};
+}
+
+void ClIndirectConv2dAddressPrecalculationKernel::run_op(ITensorPack      &tensors,
+                                                         const Window     &window,
+                                                         cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    // Get initial windows
+    const Window slice = window.first_slice_window_3D();
+
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    unsigned int idx = 0;
+    add_4d_tensor_nhwc_argument(idx, dst);
+    enqueue(queue, *this, slice);
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h
new file mode 100644
index 0000000000..b565609c6a
--- /dev/null
+++ b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_INDIRECT_CONV2D_ADDRESS_PRECALCULATION_KERNEL_H
+#define ARM_COMPUTE_CL_INDIRECT_CONV2D_ADDRESS_PRECALCULATION_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+// Forward declarations
+struct DirectConvComputeKernelInfo;
+
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the  direct convolution kernel. */
+class ClIndirectConv2dAddressPrecalculationKernel : public IClKernel
+{
+public:
+    ClIndirectConv2dAddressPrecalculationKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClIndirectConv2dAddressPrecalculationKernel);
+    /** Set the src, weights, biases and dst tensors info.
+     *
+     * @note: When M0 is 5,6,7, the kernel rounds up M0 to the nearest power of two. Therefore, eight. The reason behind
+     *        this implementation detail is because we can exploit native opencl stores in the kernel.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             The src tensor info to convolve. 3 lower dimensions represent a single src [IFM, width, height],
+     *                             while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32.
+     * @param[in]  weights         Weights tensor info. Weights are 4D tensor with dimensions [IFM, kernel_x, kernel_y, OFM].
+     *                             The 1st dimension must be the same as the src's volume 1st dimension.
+     *                             Data type supported:Same as @p src.
+     * @param[out] dst             Output tensor info where to store the precalculated offsets. Data types supported: S32.
+     *                             The output is a 3D tensor with the following dimensions: [M0 x Kw x Kh, ceil(M/M0), batch-size], where:
+     *                             Kw=Kernel width, Kh=Kernel height, M0=number of rows processed by each workitem, and M=dst_width x dst_height.
+     * @param[in]  conv_info       Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  desc            Direct convolution descriptor used to build the NHWC direct/indirect convolution kernel.
+     */
+    void configure(const CLCompileContext            &compile_context,
+                   ITensorInfo                       *src,
+                   ITensorInfo                       *weights,
+                   ITensorInfo                       *dst,
+                   const PadStrideInfo               &conv_info,
+                   const DirectConvComputeKernelInfo &desc);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClIndirectConv2dAddressPreCalculationKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo                 *src,
+                           const ITensorInfo                 *weights,
+                           const ITensorInfo                 *dst,
+                           const PadStrideInfo               &conv_info,
+                           const DirectConvComputeKernelInfo &desc);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_INDIRECT_CONV2D_ADDRESS_PRECALCULATION_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp
new file mode 100644
index 0000000000..3510b6970c
--- /dev/null
+++ b/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp
@@ -0,0 +1,335 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClIndirectConv2dKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLUtils.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo                 *src,
+                          const ITensorInfo                 *weights,
+                          const ITensorInfo                 *biases,
+                          const ITensorInfo                 *indirect_buffer,
+                          const ITensorInfo                 *dst,
+                          const PadStrideInfo               &conv_info,
+                          const ActivationLayerInfo         &act_info,
+                          const DirectConvComputeKernelInfo &desc)
+{
+    ARM_COMPUTE_UNUSED(act_info);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indirect_buffer, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+        indirect_buffer->tensor_shape(),
+        misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), src->data_layout(),
+                                                              weights->tensor_shape(), conv_info, desc));
+
+    constexpr int channel_idx = 0;
+    constexpr int batch_idx   = 3;
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx),
+                                    "Weights feature map dimension should match the respective src's one");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional");
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8,
+                                    "M0 can only be greater than 0 and less than or equal to 8");
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 &&
+                                        desc.n0 != 16,
+                                    "N0 can only be: 1, 2, 3, 4, 8, and 16");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 &&
+                                        desc.k0 != 16,
+                                    "K0 can only be: 1, 2, 3, 4, 8, and 16");
+
+    if (desc.export_weights_to_cl_image)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, "K0 can only be: 4, 8, and 16");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(weights),
+                                        "Export to CLImage is not supported for this weight configuration");
+    }
+
+    if (biases != nullptr)
+    {
+        if (is_data_type_quantized_asymmetric(src->data_type()))
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+        }
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(channel_idx) != weights->dimension(batch_idx),
+                                        "Biases size and number of dst feature maps should match");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, "Biases should be one dimensional");
+    }
+
+    // Checks performed when dst is configured
+    if (dst->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+            dst->tensor_shape(), misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+    }
+
+    return Status{};
+}
+} // namespace
+
+ClIndirectConv2dKernel::ClIndirectConv2dKernel()
+{
+    _type = CLKernelType::DIRECT;
+}
+
+void ClIndirectConv2dKernel::configure(const CLCompileContext            &compile_context,
+                                       ITensorInfo                       *src,
+                                       ITensorInfo                       *weights,
+                                       ITensorInfo                       *biases,
+                                       ITensorInfo                       *indirect_buffer,
+                                       ITensorInfo                       *dst,
+                                       const PadStrideInfo               &conv_info,
+                                       const ActivationLayerInfo         &act_info,
+                                       const DirectConvComputeKernelInfo &desc)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, indirect_buffer, dst);
+
+    // Perform validation
+    ARM_COMPUTE_ERROR_THROW_ON(
+        validate_arguments(src, weights, biases, indirect_buffer, dst, conv_info, act_info, desc));
+
+    constexpr unsigned int channel_idx   = 0;
+    constexpr unsigned int width_idx     = 1;
+    constexpr unsigned int height_idx    = 2;
+    const unsigned int     kernel_width  = weights->dimension(width_idx);
+    const unsigned int     kernel_height = weights->dimension(height_idx);
+    const DataType         data_type     = src->data_type();
+
+    const GPUTarget gpu_target = get_target();
+
+    // Get dst shape
+    TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
+
+    // Output auto inizialitation if not yet initialized
+    auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info());
+
+    // Configure kernel window
+    Window win;
+    output_shape.collapse(2U, 1U);
+    const unsigned int n0 = adjust_vec_size(desc.n0, output_shape[0]);
+    const unsigned int m0 = adjust_vec_size(desc.m0, output_shape[1]);
+    const unsigned int k0 = adjust_vec_size(desc.k0, src->dimension(channel_idx));
+
+    const unsigned int partial_store_n0 = dst->dimension(channel_idx) % n0;
+
+    // Create window and update padding
+    win = calculate_max_window(output_shape, Steps(n0, m0));
+
+    ICLKernel::configure_internal(win);
+
+    std::stringstream kernel_name;
+    CLBuildOptions    build_options;
+
+    kernel_name << "indirect_convolution_nhwc";
+
+    _export_to_cl_image = desc.export_weights_to_cl_image;
+
+    // Update the padding for the weights tensor if we can export to cl_image
+    if (_export_to_cl_image)
+    {
+        gemm::update_padding_for_cl_image(weights);
+    }
+
+    // Add padding to indirect buffer to avoid out-of-bound reads
+    // When M0 is 5, 6, and 7, we use vload8 to fetch the data from the buffer
+    const unsigned int load_indirect_buf_size = m0 > 4 ? 8 : m0;
+    const unsigned int indirect_buf_width     = indirect_buffer->tensor_shape()[0];
+    const unsigned int round_up_width =
+        ((indirect_buf_width + load_indirect_buf_size - 1) / load_indirect_buf_size) * load_indirect_buf_size;
+    const unsigned int padding = round_up_width - indirect_buf_width;
+    indirect_buffer->extend_padding(PaddingSize(0, indirect_buffer->padding().right + padding, 0, 0));
+
+    if (biases != nullptr)
+    {
+        build_options.add_option(std::string("-DHAS_BIAS"));
+        build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type())));
+    }
+
+    // Conditions of -cl-fast-relaxed-math causing accuracy issues can be traced from COMPMID-5324
+    const auto act_function = act_info.activation();
+
+    if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+        (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU ||
+         act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) &&
+        (data_type == DataType::F32 || data_type == DataType::F16))
+    {
+        // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations
+        // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations
+        build_options.add_option("-cl-unsafe-math-optimizations");
+    }
+    else
+    {
+        build_options.add_option("-cl-fast-relaxed-math");
+    }
+
+    build_options.add_option("-DSRC_TENSOR_TYPE=BUFFER");
+    build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(channel_idx)));
+    build_options.add_option("-DOFF_TENSOR_TYPE=BUFFER");
+    build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(width_idx)));
+    build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(height_idx)));
+    build_options.add_option("-DDST_TENSOR_TYPE=BUFFER");
+    build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_options.add_option_if_else(_export_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER");
+    build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(kernel_width));
+    build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(kernel_height));
+    build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_options.add_option("-DN0=" + support::cpp11::to_string(n0));
+    build_options.add_option("-DM0=" + support::cpp11::to_string(m0));
+    build_options.add_option("-DK0=" + support::cpp11::to_string(k0));
+    build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
+    build_options.add_option("-DIND_BUFF_VEC_SIZE=" + support::cpp11::to_string(load_indirect_buf_size));
+    build_options.add_option_if((src->dimension(channel_idx) % k0) != 0, "-DLEFTOVER_LOOP");
+    build_options.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_function)));
+    build_options.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
+    build_options.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_options.add_option("-D" + upper_string(kernel_name.str()));
+
+    if (compile_context.get_ddk_version() >= 30)
+    {
+        build_options.add_option("-fregister-allocation=64");
+    }
+
+    _kernel = create_kernel(compile_context, kernel_name.str(), build_options.options());
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name.str();
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_type(data_type));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(kernel_width);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(kernel_height);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src->dimension(width_idx));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src->dimension(height_idx));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src->dimension(channel_idx));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(width_idx));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(height_idx));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(channel_idx));
+}
+
+Status ClIndirectConv2dKernel::validate(const ITensorInfo                 *src,
+                                        const ITensorInfo                 *weights,
+                                        const ITensorInfo                 *biases,
+                                        const ITensorInfo                 *indirect_buffer,
+                                        const ITensorInfo                 *dst,
+                                        const PadStrideInfo               &conv_info,
+                                        const ActivationLayerInfo         &act_info,
+                                        const DirectConvComputeKernelInfo &desc)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_arguments(src, weights, biases, indirect_buffer, dst, conv_info, act_info, desc));
+    return Status{};
+}
+
+void ClIndirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    // Get initial windows
+    Window slice = window.first_slice_window_3D();
+
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto weights =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const auto biases =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+    const auto indirect_buffer =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_3));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    cl::Image2D weights_cl_image;
+
+    if (_export_to_cl_image)
+    {
+        const size_t image_w = weights->info()->dimension(0) / 4;
+        const size_t image_h =
+            weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3);
+        const TensorShape shape2d(image_w, image_h);
+        const size_t      image_row_pitch = weights->info()->strides_in_bytes()[1];
+
+        // Export cl_buffer to cl_image
+        weights_cl_image =
+            create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d,
+                                       weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+    }
+
+    unsigned int idx = 0;
+    add_4d_tensor_nhwc_argument(idx, src);
+    add_4d_tensor_nhwc_argument(idx, indirect_buffer);
+    add_4d_tensor_nhwc_argument(idx, dst);
+    if (_export_to_cl_image)
+    {
+        _kernel.setArg(idx++, weights_cl_image);
+    }
+    add_4d_tensor_nhwc_argument(idx, weights);
+    if (biases != nullptr)
+    {
+        add_1D_tensor_argument(idx, biases, slice);
+    }
+    enqueue(queue, *this, slice, lws_hint());
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClIndirectConv2dKernel.h b/src/gpu/cl/kernels/ClIndirectConv2dKernel.h
new file mode 100644
index 0000000000..04166d417e
--- /dev/null
+++ b/src/gpu/cl/kernels/ClIndirectConv2dKernel.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H
+#define ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+// Forward declaration
+struct DirectConvComputeKernelInfo;
+
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the  indirect convolution kernel. */
+class ClIndirectConv2dKernel : public IClKernel
+{
+public:
+    ClIndirectConv2dKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClIndirectConv2dKernel);
+    /** Set the src, offset, weights, biases and dst tensors info.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             The src tensor info to convolve. 3 lower dimensions represent a single src [IFM, width, height],
+     *                             while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32.
+     * @param[in]  off             The indirect buffer tensor info. Data types supported: S32.
+     * @param[in]  weights         Weights tensor info. Weights are 4D tensor with dimensions [IFM, kernel_x, kernel_y, OFM].
+     *                             Data type supported: Same as @p src.
+     * @param[in]  biases          Biases tensor info. Biases are 1D tensor with dimension [OFM].
+     *                             Data type supported: Same as @p src.
+     * @param[out] dst             Output tensor info.
+     *                             The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p src.
+     * @param[in]  conv_info       Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  act_info        Contains activaton information described in @ref ActivationLayerInfo.
+     * @param[in]  desc            Direct convolution descriptor used to build the NHWC indirect convolution kernel.
+     */
+    void configure(const CLCompileContext            &compile_context,
+                   ITensorInfo                       *src,
+                   ITensorInfo                       *off,
+                   ITensorInfo                       *weights,
+                   ITensorInfo                       *biases,
+                   ITensorInfo                       *dst,
+                   const PadStrideInfo               &conv_info,
+                   const ActivationLayerInfo         &act_info,
+                   const DirectConvComputeKernelInfo &desc);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClIndirectConv2dKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo                 *src,
+                           const ITensorInfo                 *off,
+                           const ITensorInfo                 *weights,
+                           const ITensorInfo                 *biases,
+                           const ITensorInfo                 *dst,
+                           const PadStrideInfo               &conv_info,
+                           const ActivationLayerInfo         &act_info,
+                           const DirectConvComputeKernelInfo &desc);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+public:
+    bool _export_to_cl_image{false};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp
new file mode 100644
index 0000000000..0bb6b0c083
--- /dev/null
+++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp
@@ -0,0 +1,262 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/QuantizationInfo.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info)
+{
+    const bool adj_lhs = matmul_kernel_info.adj_lhs;
+    const bool adj_rhs = matmul_kernel_info.adj_rhs;
+    const int  m0      = matmul_kernel_info.m0;
+    const int  n0      = matmul_kernel_info.n0;
+    const int  k0      = matmul_kernel_info.k0;
+
+    // Validate M0
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0");
+
+    if (adj_lhs)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16),
+                                        "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
+    }
+
+    // Validate N0
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16),
+                                    "Only 1,2,3,4,8,16 are supported for N0");
+
+    // Validate K0
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 < 1, "Only positive integers are supported for K0");
+    if (!adj_lhs || adj_rhs)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16),
+                                        "Only 1,2,3,4,8,16 are supported for K0");
+    }
+
+    return Status{};
+}
+} // namespace
+ClMatMulLowpNativeKernel::ClMatMulLowpNativeKernel()
+{
+    _type = CLKernelType::GEMM;
+}
+Status ClMatMulLowpNativeKernel::validate(const ITensorInfo         *lhs,
+                                          const ITensorInfo         *rhs,
+                                          const ITensorInfo         *bias,
+                                          const ITensorInfo         *dst,
+                                          const MatMulKernelInfo    &matmul_kernel_info,
+                                          const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.activation() != ActivationFunction::IDENTITY &&
+                                     act_info.activation() != ActivationFunction::RELU &&
+                                     act_info.activation() != ActivationFunction::LU_BOUNDED_RELU &&
+                                     act_info.activation() != ActivationFunction::BOUNDED_RELU),
+                                    "Activation Function specified is unsupported.");
+    const TensorShape expected_output_shape =
+        misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info);
+
+    if (dst->total_size() != 0)
+    {
+        const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(expected_output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
+    }
+
+    if (bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != bias->dimension(0));
+    }
+
+    return Status{};
+}
+void ClMatMulLowpNativeKernel::configure(const ClCompileContext    &compile_context,
+                                         ITensorInfo               *lhs,
+                                         ITensorInfo               *rhs,
+                                         ITensorInfo               *bias,
+                                         ITensorInfo               *dst,
+                                         const MatMulKernelInfo    &matmul_kernel_info,
+                                         const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info);
+    ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst, matmul_kernel_info);
+    ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info));
+
+    // dst tensor auto initialization if not yet initialized
+    auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(
+                                 lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
+
+    const int  m       = dst->dimension(1);
+    const int  n       = dst->dimension(0);
+    const int  k       = matmul_kernel_info.adj_lhs ? lhs->tensor_shape().y() : lhs->tensor_shape().x();
+    const bool adj_lhs = matmul_kernel_info.adj_lhs;
+
+    int m0 = adj_lhs ? adjust_vec_size(matmul_kernel_info.m0, m) : std::min(matmul_kernel_info.m0, m);
+    int n0 = adjust_vec_size(matmul_kernel_info.n0, n);
+
+    // Configure kernel window
+    Window win = calculate_max_window(*dst, Steps(n0, m0));
+    win        = win.collapse(win, Window::DimZ);
+    IClKernel::configure_internal(win);
+
+    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
+    const unsigned int partial_store_m0 = m % m0;
+    const unsigned int partial_store_n0 = n % n0;
+
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(lhs->data_type()));
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
+    build_opts.add_option("-DK0=" + support::cpp11::to_string(matmul_kernel_info.k0));
+    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
+    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
+    build_opts.add_option("-DK=" + support::cpp11::to_string(k));
+
+    const UniformQuantizationInfo lqinfo = lhs->quantization_info().uniform();
+    const UniformQuantizationInfo rqinfo = rhs->quantization_info().uniform();
+    const UniformQuantizationInfo dqinfo = dst->quantization_info().uniform();
+
+    float multiplier        = lqinfo.scale * rqinfo.scale / dqinfo.scale;
+    int   output_multiplier = 0;
+    int   output_shift      = 0;
+    arm_compute::quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
+
+    build_opts.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+    build_opts.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift));
+
+    // Note : Offset is not negated, unlike gemmlowp kernels
+    build_opts.add_option("-DLHS_OFFSET=" + support::cpp11::to_string(lqinfo.offset));
+    build_opts.add_option("-DRHS_OFFSET=" + support::cpp11::to_string(rqinfo.offset));
+    build_opts.add_option("-DDST_OFFSET=" + support::cpp11::to_string(dqinfo.offset));
+    build_opts.add_option_if(bias != nullptr, "-DBIAS");
+
+    // Floating point boundaries are quantized prior to being passed as arguments.
+    // Note: We expect the input and output tensors to always adopt a per-tensor quantization approach
+    int a_val{};
+    int b_val{};
+    std::tie(b_val, a_val) = get_quantized_activation_min_max(act_info, dst->data_type(), dqinfo);
+
+    build_opts.add_option("-DA_VAL=" + support::cpp11::to_string(a_val));
+    build_opts.add_option("-DB_VAL=" + support::cpp11::to_string(b_val));
+    build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
+    build_opts.add_option("-DZERO_POINT=" + support::cpp11::to_string(dqinfo.offset));
+
+    std::string kernel_name("mat_mul_native_quantized");
+    kernel_name += matmul_kernel_info.adj_lhs ? "_t" : "_nt";
+    kernel_name += matmul_kernel_info.adj_rhs ? "_t" : "_nt";
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Set config_id for enabling LWS tuning
+    const size_t number_of_batches = dst->tensor_shape().total_size() / (m * n);
+
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_type(lhs->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(m);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(n);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(k);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(number_of_batches);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(m0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(n0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(matmul_kernel_info.k0);
+}
+
+void ClMatMulLowpNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const ICLTensor *lhs =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const ICLTensor *rhs =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const ICLTensor *bias =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+    ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+    ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst);
+
+    unsigned int idx              = 0;
+    Window       window_collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
+
+    add_3d_tensor_nhw_argument(idx, lhs);
+    add_3d_tensor_nhw_argument(idx, rhs);
+    if (bias != nullptr)
+    {
+        add_3d_tensor_nhw_argument(idx, bias);
+    }
+    add_3d_tensor_nhw_argument(idx, dst);
+
+    enqueue(queue, *this, window_collapsed, lws_hint());
+}
+
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h
new file mode 100644
index 0000000000..ffdb720855
--- /dev/null
+++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEKERNEL
+#define ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEKERNEL
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+// Forward declerations
+struct MatMulKernelInfo;
+namespace opencl
+{
+namespace kernels
+{
+class ClMatMulLowpNativeKernel : public IClKernel
+{
+public:
+    ClMatMulLowpNativeKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClMatMulLowpNativeKernel);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context    The compile context to be used.
+     * @param[in]  lhs                Input tensor info for the LHS matrix. Data type supported: QASYMM8_SIGNED/QASYMM8.
+     *                                Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
+     * @param[in]  rhs                Input tensor info for the RHS matrix. Data type supported: same as @p lhs.
+     *                                Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
+     * @param[in]  bias               Bias tensor info. Can be nullptr. Data type supported: S32.
+     * @param[out] dst                Output tensor info. Data type supported: same as @p lhs
+     * @param[in]  matmul_kernel_info Attributes for Batch MatMul Kernel
+     * @param[in]  act_info           (Optional) Class containing information about fused activation function.
+     */
+    void configure(const ClCompileContext    &compile_context,
+                   ITensorInfo               *lhs,
+                   ITensorInfo               *rhs,
+                   ITensorInfo               *bias,
+                   ITensorInfo               *dst,
+                   const MatMulKernelInfo    &matmul_kernel_info,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClMatMulLowpNativeKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo         *lhs,
+                           const ITensorInfo         *rhs,
+                           const ITensorInfo         *bias,
+                           const ITensorInfo         *dst,
+                           const MatMulKernelInfo    &matmul_kernel_info,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEKERNEL */
diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp
new file mode 100644
index 0000000000..1df0ca0410
--- /dev/null
+++ b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+#include "utils/TypePrinter.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+// Block size dimensions for the MMUL extension
+constexpr int mmul_m0 = 4;
+constexpr int mmul_n0 = 4;
+constexpr int mmul_k0 = 16;
+
+Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info)
+{
+    const bool adj_lhs = matmul_kernel_info.adj_lhs;
+    const int  m0      = matmul_kernel_info.m0;
+    const int  n0      = matmul_kernel_info.n0;
+    const int  k0      = matmul_kernel_info.k0;
+
+    // Validate M0
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0");
+
+    if (adj_lhs)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((m0 != 1) && (m0 != 2) && (m0 != 3) && (m0 != 4) && (m0 != 8) && (m0 != 16),
+                                        "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
+    }
+
+    // Validate N0
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((n0 != 1) && (n0 != 2) && (n0 != 3) && (n0 != 4) && (n0 != 8) && (n0 != 16),
+                                    "Only 1,2,3,4,8,16 are supported for N0");
+
+    // Validate K0
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((k0 != 4), "Only 4 is supported for k0");
+
+    // Validate ExportToCLImage
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(matmul_kernel_info.export_rhs_to_cl_image, "Export to CLImage is not supported!");
+
+    return Status{};
+}
+} // namespace
+
+ClMatMulLowpNativeMMULKernel::ClMatMulLowpNativeMMULKernel()
+{
+    _type = CLKernelType::GEMM;
+}
+
+Status ClMatMulLowpNativeMMULKernel::validate(const ITensorInfo         *lhs,
+                                              const ITensorInfo         *rhs,
+                                              const ITensorInfo         *bias,
+                                              const ITensorInfo         *dst,
+                                              const MatMulKernelInfo    &matmul_kernel_info,
+                                              const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()),
+                                    "The extension cl_arm_matrix_multiply is not supported on the target platform");
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info));
+
+    const TensorShape &lhs_shape = lhs->tensor_shape();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_input_shapes(lhs_shape, rhs->tensor_shape(), matmul_kernel_info));
+
+    const size_t lhs_k = matmul_kernel_info.adj_lhs ? lhs_shape.y() : lhs_shape.x();
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR((lhs_k % mmul_k0) != 0, "K dimension must be a multiple of %d", mmul_k0);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.activation() != ActivationFunction::IDENTITY),
+                                    "Activation Function specified is unsupported.");
+    const TensorShape expected_output_shape =
+        misc::shape_calculator::compute_matmul_shape(lhs_shape, rhs->tensor_shape(), matmul_kernel_info);
+
+    if (dst->total_size() != 0)
+    {
+        const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(expected_output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
+    }
+
+    if (bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(expected_output_shape[0] != bias->dimension(0));
+    }
+
+    return Status{};
+}
+
+void ClMatMulLowpNativeMMULKernel::configure(const ClCompileContext    &compile_context,
+                                             ITensorInfo               *lhs,
+                                             ITensorInfo               *rhs,
+                                             ITensorInfo               *bias,
+                                             ITensorInfo               *dst,
+                                             const MatMulKernelInfo    &matmul_kernel_info,
+                                             const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+    ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst, matmul_kernel_info, act_info);
+    ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info));
+
+    // dst tensor auto initialization if not yet initialized
+    auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(
+                                 lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
+
+    ARM_COMPUTE_UNUSED(compile_context, lhs, rhs, bias, matmul_kernel_info, act_info);
+    CLBuildOptions build_opts;
+
+    const int m = dst->dimension(1);
+    const int n = dst->dimension(0);
+    const int k = matmul_kernel_info.adj_lhs ? lhs->tensor_shape().y() : lhs->tensor_shape().x();
+
+    const int m0 = std::min(matmul_kernel_info.m0, m);
+    const int n0 = adjust_vec_size(matmul_kernel_info.n0, n);
+
+    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks
+    // at the end of a row/column if any. This is to avoid padding.
+    const unsigned int m0_leftover = m % m0;
+    const unsigned int n0_leftover = n % n0;
+
+    // Configure kernel window
+    const auto win_config =
+        validate_and_configure_window_for_mmul_kernels(lhs, rhs, dst, matmul_kernel_info, mmul_m0, mmul_n0);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    IClKernel::configure_internal(win_config.second);
+
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(lhs->data_type()));
+    build_opts.add_option("-DM=" + support::cpp11::to_string(m));
+    build_opts.add_option("-DN=" + support::cpp11::to_string(n));
+    build_opts.add_option("-DK=" + support::cpp11::to_string(k));
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
+    build_opts.add_option("-DK0=" + support::cpp11::to_string(matmul_kernel_info.k0));
+    build_opts.add_option("-DM0_LEFTOVER=" + support::cpp11::to_string(m0_leftover));
+    build_opts.add_option("-DN0_LEFTOVER=" + support::cpp11::to_string(n0_leftover));
+    build_opts.add_option("-DMMUL_M0=" + support::cpp11::to_string(mmul_m0));
+    build_opts.add_option("-DMMUL_N0=" + support::cpp11::to_string(mmul_n0));
+    build_opts.add_option("-DMMUL_K0=" + support::cpp11::to_string(mmul_k0));
+    build_opts.add_option_if(bias != nullptr, "-DBIAS");
+
+    const UniformQuantizationInfo lqinfo = lhs->quantization_info().uniform();
+    const UniformQuantizationInfo rqinfo = rhs->quantization_info().uniform();
+    const UniformQuantizationInfo dqinfo = dst->quantization_info().uniform();
+
+    float multiplier        = lqinfo.scale * rqinfo.scale / dqinfo.scale;
+    int   output_multiplier = 0;
+    int   output_shift      = 0;
+    arm_compute::quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
+
+    build_opts.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+    build_opts.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift));
+
+    // Note : Offset is not negated, unlike gemmlowp kernels
+    build_opts.add_option("-DLHS_OFFSET=" + support::cpp11::to_string(lqinfo.offset));
+    build_opts.add_option("-DRHS_OFFSET=" + support::cpp11::to_string(rqinfo.offset));
+    build_opts.add_option("-DDST_OFFSET=" + support::cpp11::to_string(dqinfo.offset));
+
+    std::string kernel_name("mat_mul_native_quantized_mmul");
+    kernel_name += matmul_kernel_info.adj_lhs ? "_t" : "_nt";
+    kernel_name += matmul_kernel_info.adj_rhs ? "_t" : "_nt";
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_type(lhs->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(m);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(n);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(k);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(m0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(n0);
+}
+
+void ClMatMulLowpNativeMMULKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto *lhs =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto *rhs =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const auto *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(
+        tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present
+    auto *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+    ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst);
+
+    unsigned int idx = 0;
+    add_3d_tensor_nhw_argument(idx, lhs);
+    add_3d_tensor_nhw_argument(idx, rhs);
+
+    if (bias != nullptr)
+    {
+        add_3d_tensor_nhw_argument(idx, bias);
+    }
+    add_3d_tensor_nhw_argument(idx, dst);
+
+    // LWS_x should be multiple of 16 at least. (32, 2) has been chosen to have more work-items on a single core
+    // LWS also enforces the order of execution of the work items which improves cache utilization
+    enqueue(queue, *this, window, cl::NDRange(32, 2), false);
+}
+
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h
new file mode 100644
index 0000000000..e84d46d34a
--- /dev/null
+++ b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEMMULKERNEL_H
+#define ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEMMULKERNEL_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+// Forward declerations
+struct MatMulKernelInfo;
+namespace opencl
+{
+namespace kernels
+{
+class ClMatMulLowpNativeMMULKernel : public IClKernel
+{
+public:
+    ClMatMulLowpNativeMMULKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClMatMulLowpNativeMMULKernel);
+
+    /** Initialise the kernel's input and output.
+     *
+     * Similar to @ref ClMatMulLowpNativeKernel::configure()
+     *
+     */
+    void configure(const ClCompileContext    &compile_context,
+                   ITensorInfo               *lhs,
+                   ITensorInfo               *rhs,
+                   ITensorInfo               *bias,
+                   ITensorInfo               *dst,
+                   const MatMulKernelInfo    &matmul_kernel_info,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClMatMulLowpNativeKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo         *lhs,
+                           const ITensorInfo         *rhs,
+                           const ITensorInfo         *bias,
+                           const ITensorInfo         *dst,
+                           const MatMulKernelInfo    &matmul_kernel_info,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEMMULKERNEL_H
diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp
new file mode 100644
index 0000000000..a1fa9fa9ab
--- /dev/null
+++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/CL/CLUtils.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+#include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info)
+{
+    const bool adj_lhs = matmul_kernel_info.adj_lhs;
+    const bool adj_rhs = matmul_kernel_info.adj_rhs;
+    const int  m0      = matmul_kernel_info.m0;
+    const int  n0      = matmul_kernel_info.n0;
+    const int  k0      = matmul_kernel_info.k0;
+
+    // Validate M0
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0");
+
+    if (adj_lhs)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16),
+                                        "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
+    }
+
+    // Validate N0
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16),
+                                    "Only 1,2,3,4,8,16 are supported for N0");
+
+    // Validate K0
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 < 1, "Only positive integers are supported for K0");
+    if (!adj_lhs || adj_rhs)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16),
+                                        "Only 1,2,3,4,8,16 are supported for K0");
+    }
+
+    return Status{};
+}
+
+Status validate_export_to_cl_image(const ITensorInfo *rhs, const MatMulKernelInfo &matmul_kernel_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(matmul_kernel_info.export_rhs_to_cl_image && rhs->lock_paddings());
+    if (matmul_kernel_info.export_rhs_to_cl_image)
+    {
+        if (matmul_kernel_info.adj_rhs)
+        {
+            const int k0 = matmul_kernel_info.k0;
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 != 4 && k0 != 8 && k0 != 16,
+                                            "K0 can only be: 4, 8, and 16 for Rhs transposed");
+        }
+        else
+        {
+            const int n0 = matmul_kernel_info.n0;
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 != 4 && n0 != 8 && n0 != 16,
+                                            "N0 can only be: 4, 8, and 16 for Rhs non-transposed");
+        }
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(rhs),
+                                        "Export to CLImage is not supported for this device/configuration");
+    }
+
+    return Status{};
+}
+} // namespace
+ClMatMulNativeKernel::ClMatMulNativeKernel()
+{
+    _type = CLKernelType::GEMM;
+}
+
+Status ClMatMulNativeKernel::validate(const ITensorInfo         *lhs,
+                                      const ITensorInfo         *rhs,
+                                      const ITensorInfo         *bias,
+                                      const ITensorInfo         *dst,
+                                      const MatMulKernelInfo    &matmul_kernel_info,
+                                      const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_UNUSED(act_info);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_export_to_cl_image(rhs, matmul_kernel_info));
+
+    const TensorShape expected_output_shape =
+        misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info);
+
+    if (dst->total_size() != 0)
+    {
+        const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
+    }
+
+    if (bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bias, lhs);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((bias->num_dimensions() > 1), "Multi dimensional bias is unsupported.");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != expected_output_shape[0],
+                                        "First dimension of bias and output tensors must match.");
+    }
+
+    return Status{};
+}
+void ClMatMulNativeKernel::configure(const ClCompileContext    &compile_context,
+                                     ITensorInfo               *lhs,
+                                     ITensorInfo               *rhs,
+                                     ITensorInfo               *bias,
+                                     ITensorInfo               *dst,
+                                     const MatMulKernelInfo    &matmul_kernel_info,
+                                     const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info);
+    ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst, matmul_kernel_info);
+    ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info));
+
+    // dst tensor auto initialization if not yet initialized
+    auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(
+                                 lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
+
+    const int  m       = dst->dimension(1);
+    const int  n       = dst->dimension(0);
+    const int  k       = matmul_kernel_info.adj_lhs ? lhs->tensor_shape().y() : lhs->tensor_shape().x();
+    const bool adj_lhs = matmul_kernel_info.adj_lhs;
+
+    int m0 = adj_lhs ? adjust_vec_size(matmul_kernel_info.m0, m) : std::min(matmul_kernel_info.m0, m);
+    int n0 = adjust_vec_size(matmul_kernel_info.n0, n);
+
+    _export_rhs_to_cl_image = matmul_kernel_info.export_rhs_to_cl_image && !rhs->lock_paddings();
+
+    // Configure kernel window
+    Window win = calculate_max_window(*dst, Steps(n0, m0));
+    win        = win.collapse(win, Window::DimZ);
+    IClKernel::configure_internal(win);
+
+    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
+    const unsigned int partial_store_m0 = m % m0;
+    const unsigned int partial_store_n0 = n % n0;
+
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(lhs->data_type()));
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
+    build_opts.add_option("-DK0=" + support::cpp11::to_string(matmul_kernel_info.k0));
+    build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
+    build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
+    build_opts.add_option("-DK=" + support::cpp11::to_string(k));
+    build_opts.add_option_if(bias != nullptr, "-DBIAS");
+    build_opts.add_option_if_else(_export_rhs_to_cl_image, "-DRHS_TENSOR_TYPE=IMAGE", "-DRHS_TENSOR_TYPE=BUFFER");
+
+    // Define values for activation function
+    build_opts.add_option(("-DA_VAL=" + float_to_string_with_full_precision(act_info.a())));
+    build_opts.add_option(("-DB_VAL=" + float_to_string_with_full_precision(act_info.b())));
+    build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
+
+    std::string kernel_name("mat_mul_native");
+    kernel_name += matmul_kernel_info.adj_lhs ? "_t" : "_nt";
+    kernel_name += matmul_kernel_info.adj_rhs ? "_t" : "_nt";
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
+    if (_export_rhs_to_cl_image)
+    {
+        gemm::update_padding_for_cl_image(rhs);
+    }
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_type(lhs->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(m);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(n);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(k);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(_export_rhs_to_cl_image);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(m0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(n0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(matmul_kernel_info.k0);
+}
+
+void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const ICLTensor *lhs =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const ICLTensor *rhs =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const ICLTensor *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(
+        tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present
+    ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+    ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst);
+
+    unsigned int idx              = 0;
+    Window       window_collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
+
+    add_3d_tensor_nhw_argument(idx, lhs);
+
+    cl::Image2D rhs_cl_image;
+    if (_export_rhs_to_cl_image)
+    {
+        const size_t      image_w = rhs->info()->dimension(0) / 4;
+        const size_t      image_h = rhs->info()->tensor_shape().total_size() / rhs->info()->dimension(0);
+        const TensorShape shape2d(image_w, image_h);
+        const size_t      image_row_pitch = rhs->info()->strides_in_bytes()[1];
+
+        // Export cl_buffer to cl_image
+        rhs_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), rhs->cl_buffer(), shape2d,
+                                                  rhs->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+        _kernel.setArg(idx++, rhs_cl_image);
+    }
+
+    add_3d_tensor_nhw_argument(idx, rhs);
+    if (bias != nullptr)
+    {
+        add_3d_tensor_nhw_argument(idx, bias);
+    }
+    add_3d_tensor_nhw_argument(idx, dst);
+
+    enqueue(queue, *this, window_collapsed, lws_hint());
+}
+
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.h b/src/gpu/cl/kernels/ClMatMulNativeKernel.h
new file mode 100644
index 0000000000..2cb150bc8f
--- /dev/null
+++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLMATMULNATIVEKERNEL
+#define ACL_SRC_GPU_CL_KERNELS_CLMATMULNATIVEKERNEL
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+class ClMatMulNativeKernel : public IClKernel
+{
+public:
+    ClMatMulNativeKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClMatMulNativeKernel);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context    The compile context to be used.
+     * @param[in]  lhs                Input tensor info for the LHS matrix. Data type supported: F32/F16.
+     *                                Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
+     * @param[in]  rhs                Input tensor info for the RHS matrix. Data type supported: same as @p lhs.
+     *                                Dimensions above 2 are collapsed onto dimension 2 and represent the batch.
+     * @param[in]  bias               Bias tensor info for bias matrix. Can be nullptr. Data type supported: same as @p lhs.
+     * @param[out] dst                Output tensor info. Data type supported: same as @p lhs
+     * @param[in]  matmul_kernel_info Attributes for Batch MatMul Kernel
+     * @param[in]  act_info           (Optional) Specifies activation function to use after Matrix multiplication. Default is Identity function.
+     */
+    void configure(const ClCompileContext    &compile_context,
+                   ITensorInfo               *lhs,
+                   ITensorInfo               *rhs,
+                   ITensorInfo               *bias,
+                   ITensorInfo               *dst,
+                   const MatMulKernelInfo    &matmul_kernel_info,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClMatMulNativeKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo         *lhs,
+                           const ITensorInfo         *rhs,
+                           const ITensorInfo         *bias,
+                           const ITensorInfo         *dst,
+                           const MatMulKernelInfo    &matmul_kernel_info,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    bool _export_rhs_to_cl_image{false};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ACL_SRC_GPU_CL_KERNELS_CLMATMULNATIVEKERNEL */
diff --git a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp
new file mode 100644
index 0000000000..76bf846e74
--- /dev/null
+++ b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp
@@ -0,0 +1,235 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+// Block size dimensions for the MMUL extension
+constexpr int mmul_m0 = 4;
+constexpr int mmul_n0 = 4;
+constexpr int mmul_k0 = 4;
+
+Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info)
+{
+    const bool adj_lhs = matmul_kernel_info.adj_lhs;
+    const int  m0      = matmul_kernel_info.m0;
+    const int  n0      = matmul_kernel_info.n0;
+    const int  k0      = matmul_kernel_info.k0;
+
+    // Validate M0
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0");
+
+    if (adj_lhs)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((m0 != 1) && (m0 != 2) && (m0 != 3) && (m0 != 4) && (m0 != 8) && (m0 != 16),
+                                        "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
+    }
+
+    // Validate N0
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((n0 != 1) && (n0 != 2) && (n0 != 3) && (n0 != 4) && (n0 != 8) && (n0 != 16),
+                                    "Only 1,2,3,4,8,16 are supported for N0");
+
+    // Validate K0
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((k0 != 1), "Only 1 is supported for k0");
+
+    return Status{};
+}
+} // namespace
+ClMatMulNativeMMULKernel::ClMatMulNativeMMULKernel()
+{
+    _type = CLKernelType::GEMM;
+}
+
+Status ClMatMulNativeMMULKernel::validate(const ITensorInfo      *lhs,
+                                          const ITensorInfo      *rhs,
+                                          const ITensorInfo      *bias,
+                                          const ITensorInfo      *dst,
+                                          const MatMulKernelInfo &matmul_kernel_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()),
+                                    "The extension cl_arm_matrix_multiply is not supported on the target platform");
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info));
+
+    const TensorShape &lhs_shape = lhs->tensor_shape();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_input_shapes(lhs_shape, rhs->tensor_shape(), matmul_kernel_info));
+
+    const size_t lhs_k = matmul_kernel_info.adj_lhs ? lhs_shape.y() : lhs_shape.x();
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR((lhs_k % mmul_k0) != 0, "K dimension must be a multiple of %d", mmul_k0);
+
+    const TensorShape expected_output_shape =
+        misc::shape_calculator::compute_matmul_shape(lhs_shape, rhs->tensor_shape(), matmul_kernel_info);
+
+    if (dst->total_size() != 0)
+    {
+        const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
+    }
+
+    if (bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((bias->num_dimensions() > 1), "Multi dimensional bias is unsupported.");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != expected_output_shape[0],
+                                        "First dimension of bias and output tensors must match.");
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, bias);
+    }
+
+    return Status{};
+}
+void ClMatMulNativeMMULKernel::configure(const ClCompileContext &compile_context,
+                                         ITensorInfo            *lhs,
+                                         ITensorInfo            *rhs,
+                                         ITensorInfo            *bias,
+                                         ITensorInfo            *dst,
+                                         const MatMulKernelInfo &matmul_kernel_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+    ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst, matmul_kernel_info);
+    ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info));
+
+    // dst tensor auto initialization if not yet initialized
+    auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(
+                                 lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
+
+    const int m = dst->dimension(1);
+    const int n = dst->dimension(0);
+    const int k = matmul_kernel_info.adj_lhs ? lhs->tensor_shape().y() : lhs->tensor_shape().x();
+
+    _m = m;
+    _n = n;
+    _k = k;
+
+    const int m0 = std::min(matmul_kernel_info.m0, m);
+    const int n0 = adjust_vec_size(matmul_kernel_info.n0, n);
+
+    // Configure kernel window
+    const auto win_config =
+        validate_and_configure_window_for_mmul_kernels(lhs, rhs, dst, matmul_kernel_info, mmul_m0, mmul_n0);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    IClKernel::configure_internal(win_config.second);
+
+    // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
+    const unsigned int m0_leftover = m % m0;
+    const unsigned int n0_leftover = n % n0;
+
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(lhs->data_type()));
+    build_opts.add_option_if(lhs->data_type() == DataType::F16, "-DHALF_PRECISION");
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
+    build_opts.add_option("-DM0_LEFTOVER=" + support::cpp11::to_string(m0_leftover));
+    build_opts.add_option("-DN0_LEFTOVER=" + support::cpp11::to_string(n0_leftover));
+    build_opts.add_option("-DMMUL_M0=" + support::cpp11::to_string(mmul_m0));
+    build_opts.add_option("-DMMUL_N0=" + support::cpp11::to_string(mmul_n0));
+    build_opts.add_option("-DMMUL_K0=" + support::cpp11::to_string(mmul_k0));
+    build_opts.add_option_if(bias != nullptr, "-DBIAS");
+
+    std::string kernel_name("mat_mul_native_mmul");
+    kernel_name += matmul_kernel_info.adj_lhs ? "_t" : "_nt";
+    kernel_name += matmul_kernel_info.adj_rhs ? "_t" : "_nt";
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_type(lhs->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(k);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(m0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(n0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(matmul_kernel_info.k0);
+}
+
+void ClMatMulNativeMMULKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const ICLTensor *lhs =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const ICLTensor *rhs =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const ICLTensor *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(
+        tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present
+    ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+    ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst);
+    unsigned int idx = 0;
+
+    add_3d_tensor_nhw_argument(idx, lhs);
+    add_3d_tensor_nhw_argument(idx, rhs);
+    if (bias != nullptr)
+    {
+        add_3d_tensor_nhw_argument(idx, bias);
+    }
+    add_3d_tensor_nhw_argument(idx, dst);
+
+    // Pass m and n at runtime as signed ints, to ensure results of any subtractions they could be operand in, would still be signed.
+    _kernel.setArg<cl_int>(idx++, _m);
+    _kernel.setArg<cl_int>(idx++, _n);
+    _kernel.setArg<cl_int>(idx++, _k);
+
+    // LWS_x should be multiple of 16 at least. (32, 2) has been chosen to have more work-items on a single core
+    // LWS also enforces the order of execution of the work items which improves cache utilization
+    enqueue(queue, *this, window, cl::NDRange(32, 2), false);
+}
+
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h
new file mode 100644
index 0000000000..1aeb896325
--- /dev/null
+++ b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLMATMULNATIVEMMULKERNEL
+#define ACL_SRC_GPU_CL_KERNELS_CLMATMULNATIVEMMULKERNEL
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+struct MatMulKernelInfo;
+namespace opencl
+{
+namespace kernels
+{
+class ClMatMulNativeMMULKernel : public IClKernel
+{
+public:
+    ClMatMulNativeMMULKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClMatMulNativeMMULKernel);
+    /** Initialize the kernel's input and output.
+     *
+     * This kernel performs matrix multiplication of lhs and rhs:
+     *
+     *  dst = matmul(lhs, rhs)
+     *
+     * Valid data layouts:
+     * - All
+     *
+     * Valid data type configurations:
+     * |lhs            |rhs            |dst            |
+     * |:--------------|:--------------|:--------------|
+     * |F32            |F32            |F32            |
+     * |F16            |F16            |F16            |
+     *
+     * Shape definitions:
+     *       Dim0, Dim1,       Dim2...
+     * lhs: [   K,    M, Batch dims...]
+     * rhs: [   N,    K, Batch dims...]
+     * dst: [   N,    M, Batch dims...]
+     *
+     * Valid shape configurations:
+     * - K must be a multiple of 4 (MMUL_K0).
+     * - No broadcasting in batch dimensions. I.e. batch dims must be the same across lhs, rhs and dst
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  lhs             Input tensor info for the LHS matrix.
+     * @param[in]  rhs             Input tensor info for the RHS matrix.
+     * @param[in]  bias            Bias tensor info. Can be nullptr. Data type supported: Same as @p lhs.
+     * @param[out] dst             Output tensor info.
+     * @param[in]  matmul_info     Attributes for Batch MatMul Kernel
+     */
+    void configure(const ClCompileContext &compile_context,
+                   ITensorInfo            *lhs,
+                   ITensorInfo            *rhs,
+                   ITensorInfo            *bias,
+                   ITensorInfo            *dst,
+                   const MatMulKernelInfo &matmul_info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClMatMulNativeMMULKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo      *lhs,
+                           const ITensorInfo      *rhs,
+                           const ITensorInfo      *bias,
+                           const ITensorInfo      *dst,
+                           const MatMulKernelInfo &matmul_info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    int _m{1};
+    int _n{1};
+    int _k{1};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ACL_SRC_GPU_CL_KERNELS_CLMATMULNATIVEMMULKERNEL */
diff --git a/src/gpu/cl/kernels/ClMulKernel.cpp b/src/gpu/cl/kernels/ClMulKernel.cpp
new file mode 100644
index 0000000000..3b59c2a7fc
--- /dev/null
+++ b/src/gpu/cl/kernels/ClMulKernel.cpp
@@ -0,0 +1,484 @@
+/*
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClMulKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo         *src1,
+                          const ITensorInfo         *src2,
+                          const ITensorInfo         *dst,
+                          float                      scale,
+                          ConvertPolicy              overflow_policy,
+                          RoundingPolicy             rounding_policy,
+                          const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_UNUSED(overflow_policy);
+    ARM_COMPUTE_UNUSED(rounding_policy);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16,
+                                                         DataType::F16, DataType::S32, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16,
+                                                         DataType::F16, DataType::S32, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
+    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type()));
+
+    // Check whether it is in_place calculation
+    const bool in_place      = (src1 == dst) || (src2 == dst);
+    const bool src1_in_place = in_place && (src1 == dst);
+
+    const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+
+    // Validate in case of configured dst
+    if (dst->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8,
+                                                             DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16,
+                                                             DataType::F16, DataType::S32, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::U8 &&
+                                            (src1->data_type() != DataType::U8 || src2->data_type() != DataType::U8),
+                                        "Dst can only be U8 if both src are U8");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            dst->data_type() == DataType::QASYMM8 &&
+                (src1->data_type() != DataType::QASYMM8 || src2->data_type() != DataType::QASYMM8),
+            "Dst can only be QASYMM8 if both src are QASYMM8");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            dst->data_type() == DataType::QASYMM8_SIGNED &&
+                (src1->data_type() != DataType::QASYMM8_SIGNED || src2->data_type() != DataType::QASYMM8_SIGNED),
+            "Dst can only be QASYMM8_SIGNED if both src are QASYMM8_SIGNED");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            dst->data_type() == DataType::QSYMM16 &&
+                (src1->data_type() != DataType::QSYMM16 || src2->data_type() != DataType::QSYMM16),
+            "Dst can only be QSYMM16 if both src are QSYMM16");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((src1->data_type() == DataType::S32 || src2->data_type() == DataType::S32) &&
+                                            (dst->data_type() != DataType::S32),
+                                        "Dst must be S32 if source tensors are S32");
+        if (in_place)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                detail::have_different_dimensions(out_shape,
+                                                  src1_in_place ? src1->tensor_shape() : src2->tensor_shape(), 0),
+                "Wrong shape for dst, cannot do in_place calculation");
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0),
+                                            "Wrong shape for dst");
+        }
+    }
+
+    return Status{};
+}
+} // namespace
+
+ClMulKernel::ClMulKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClMulKernel::configure(const CLCompileContext    &compile_context,
+                            ITensorInfo               *src1,
+                            ITensorInfo               *src2,
+                            ITensorInfo               *dst,
+                            float                      scale,
+                            ConvertPolicy              overflow_policy,
+                            RoundingPolicy             rounding_policy,
+                            const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info));
+
+    auto padding_info = get_padding_info({src1, src2, dst});
+
+    const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
+    auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape));
+
+    int scale_int = -1;
+    // Extract sign, exponent and mantissa
+    int   exponent            = 0;
+    float normalized_mantissa = std::frexp(scale, &exponent);
+    // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
+    // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14
+    // Moreover, it will be negative as we deal with 1/2^n
+    if ((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1))
+    {
+        // Store the positive exponent. We know that we compute 1/2^n
+        // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5
+        scale_int = std::abs(exponent - 1);
+    }
+
+    std::string acc_type;
+    // Check if it has float src and dst
+    if (is_data_type_float(src1->data_type()) || is_data_type_float(src2->data_type()))
+    {
+        scale_int = -1;
+        acc_type  = (src1->data_type() == DataType::F32 || src2->data_type() == DataType::F32) ? "float" : "half";
+    }
+    else
+    {
+        if (src1->element_size() == 4 || src2->element_size() == 4)
+        {
+            // use 64 bit accumulator for 32-bit input
+            acc_type = "long";
+        }
+        else if (src1->element_size() == 2 || src2->element_size() == 2)
+        {
+            // Use 32-bit accumulator for 16-bit input
+            acc_type = "int";
+        }
+        else
+        {
+            // Use 16-bit accumulator for 8-bit input
+            acc_type = "ushort";
+        }
+    }
+
+    const bool         is_quantized      = is_data_type_quantized(src1->data_type());
+    const unsigned int vec_size          = adjust_vec_size(16 / dst->element_size(), dst->dimension(0));
+    const unsigned int vec_size_leftover = dst->dimension(0) % vec_size;
+
+    // Set kernel build options
+    std::string    kernel_name = "pixelwise_mul";
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(src1->data_type()));
+    build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(src2->data_type()));
+    build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type()));
+    build_opts.add_option("-DVEC_SIZE_IN1=" + ((dst->dimension(0) != 1 && src1->dimension(0) == 1)
+                                                   ? "1"
+                                                   : support::cpp11::to_string(vec_size)));
+    build_opts.add_option("-DVEC_SIZE_IN2=" + ((dst->dimension(0) != 1 && src2->dimension(0) == 1)
+                                                   ? "1"
+                                                   : support::cpp11::to_string(vec_size)));
+    build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(vec_size));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
+    if (is_quantized && (dst->data_type() != DataType::S32))
+    {
+        const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform();
+        const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info  = dst->quantization_info().uniform();
+
+        build_opts.add_option_if(is_data_type_quantized_asymmetric(src1->data_type()),
+                                 "-DOFFSET_IN1=" + support::cpp11::to_string(iq1_info.offset));
+        build_opts.add_option_if(is_data_type_quantized_asymmetric(src2->data_type()),
+                                 "-DOFFSET_IN2=" + support::cpp11::to_string(iq2_info.offset));
+        build_opts.add_option_if(is_data_type_quantized_asymmetric(dst->data_type()),
+                                 "-DOFFSET_OUT=" + support::cpp11::to_string(oq_info.offset));
+        build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale));
+        build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale));
+        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
+        kernel_name += "_quantized";
+    }
+    else
+    {
+        kernel_name += (scale_int >= 0) ? "_int" : "_float";
+        build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(dst->data_type()),
+                                      "-DWRAP", "-DSATURATE");
+        build_opts.add_option_if_else(rounding_policy == RoundingPolicy::TO_ZERO, "-DROUND=_rtz", "-DROUND=_rte");
+        build_opts.add_option("-DACC_DATA_TYPE=" + acc_type);
+        if (act_info.enabled())
+        {
+            build_opts.add_option("-DACTIVATION_TYPE=" +
+                                  lower_string(string_from_activation_func(act_info.activation())));
+            build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
+            build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
+        }
+    }
+
+    // Check whether it is in_place calculation
+    const bool in_place      = (src1 == dst) || (src2 == dst);
+    const bool src1_in_place = in_place && (src1 == dst);
+    build_opts.add_option_if(in_place, "-DIN_PLACE");
+    build_opts.add_option_if(src1_in_place, "-DSRC1_IN_PLACE");
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Set scale argument
+    unsigned int idx = (in_place ? 2 : 3) * num_arguments_per_3D_tensor(); // Skip the src and dst parameters
+
+    if (scale_int >= 0 && !is_quantized)
+    {
+        _kernel.setArg(idx++, scale_int);
+    }
+    else
+    {
+        _kernel.setArg(idx++, scale);
+    }
+
+    Window win = calculate_max_window(*dst, Steps(vec_size));
+    ICLKernel::configure_internal(win);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_type(dst->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src1->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src1->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src1->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src2->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src2->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src2->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(2));
+}
+
+Status ClMulKernel::validate(const ITensorInfo         *src1,
+                             const ITensorInfo         *src2,
+                             const ITensorInfo         *dst,
+                             float                      scale,
+                             ConvertPolicy              overflow_policy,
+                             RoundingPolicy             rounding_policy,
+                             const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info));
+
+    return Status{};
+}
+
+void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src_0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src_1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src_0, src_1, dst);
+
+    const TensorShape &in_shape1 = src_0->info()->tensor_shape();
+    const TensorShape &in_shape2 = src_1->info()->tensor_shape();
+    const TensorShape &out_shape = dst->info()->tensor_shape();
+
+    bool can_collapse = true;
+    if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+    {
+        can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+        for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
+        {
+            can_collapse = (in_shape1[d] == in_shape2[d]);
+        }
+    }
+
+    bool   has_collapsed = false;
+    Window collapsed =
+        can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+
+    const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+    const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+    Window slice        = collapsed.first_slice_window_3D();
+    Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+    Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+    // Check whether it is in_place calculation
+    const bool in_place = (src_0 == dst) || (src_1 == dst);
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src_0, slice_input1);
+        add_3D_tensor_argument(idx, src_1, slice_input2);
+        if (!in_place)
+        {
+            add_3D_tensor_argument(idx, dst, slice);
+        }
+        enqueue(queue, *this, slice, lws_hint());
+
+        ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
+        ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
+    } while (collapsed.slide_window_slice_3D(slice));
+}
+
+namespace
+{
+constexpr unsigned int vec_size_complex = 1;
+
+Status validate_arguments_complex(const ITensorInfo         *src1,
+                                  const ITensorInfo         *src2,
+                                  const ITensorInfo         *dst,
+                                  const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
+
+    const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
+    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type()));
+
+    // Validate in case of configured dst
+    if (dst->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0),
+                                        "Wrong shape for dst");
+    }
+
+    return Status{};
+}
+} // namespace
+
+ClComplexMulKernel::ClComplexMulKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClComplexMulKernel::configure(const CLCompileContext    &compile_context,
+                                   ITensorInfo               *src1,
+                                   ITensorInfo               *src2,
+                                   ITensorInfo               *dst,
+                                   const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst, act_info));
+
+    auto padding_info = get_padding_info({src1, src2, dst});
+
+    const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
+    auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape));
+
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
+    if (act_info.enabled())
+    {
+        build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
+        build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
+        build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
+    }
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, "pixelwise_mul_complex", build_opts.options());
+
+    Window win = calculate_max_window(*dst, Steps(vec_size_complex));
+    ICLKernel::configure_internal(win);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClComplexMulKernel::validate(const ITensorInfo         *src1,
+                                    const ITensorInfo         *src2,
+                                    const ITensorInfo         *dst,
+                                    const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst, act_info));
+
+    return Status{};
+}
+
+void ClComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src_0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src_1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    const TensorShape &in_shape1 = src_0->info()->tensor_shape();
+    const TensorShape &in_shape2 = src_1->info()->tensor_shape();
+    const TensorShape &out_shape = dst->info()->tensor_shape();
+
+    bool can_collapse = true;
+    if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+    {
+        can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+        for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
+        {
+            can_collapse = (in_shape1[d] == in_shape2[d]);
+        }
+    }
+
+    bool   has_collapsed = false;
+    Window collapsed =
+        can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+
+    const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+    const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+    Window slice        = collapsed.first_slice_window_3D();
+    Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+    Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src_0, slice_input1);
+        add_3D_tensor_argument(idx, src_1, slice_input2);
+        add_3D_tensor_argument(idx, dst, slice);
+        enqueue(queue, *this, slice, lws_hint());
+
+        ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
+        ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
+    } while (collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClMulKernel.h b/src/gpu/cl/kernels/ClMulKernel.h
new file mode 100644
index 0000000000..76a3ce02c1
--- /dev/null
+++ b/src/gpu/cl/kernels/ClMulKernel.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2016-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_MUL_KERNEL_H
+#define ARM_COMPUTE_CL_MUL_KERNEL_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the pixelwise multiplication kernel.
+ *
+ * For binary elementwise ops in-place cannot be enabled by passing nullptr to dst, it can only be enabled by passing either src1 or src2 to dst instead.
+ *
+*/
+class ClMulKernel : public IClKernel
+{
+public:
+    ClMulKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClMulKernel);
+    /** Initialise the kernel's src and dst.
+     *
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)                         -> U8
+     *   - (U8,U8)                         -> S16
+     *   - (U8,S16)                        -> S16
+     *   - (S16,U8)                        -> S16
+     *   - (S16,S16)                       -> S16
+     *   - (S32,S32)                       -> S32
+     *   - (F16,F16)                       -> F16
+     *   - (F32,F32)                       -> F32
+     *   - (QASYMM8,QASYMM8)               -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16)               -> QSYMM16
+     *   - (QSYMM16,QSYMM16)               -> S32
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src1            An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
+     * @param[in]  src2            An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
+     * @param[out] dst             The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
+     * @param[in]  scale           Scale to apply after multiplication.
+     *                             Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+     * @param[in]  overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+     * @param[in]  rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
+     * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const CLCompileContext    &compile_context,
+                   ITensorInfo               *src1,
+                   ITensorInfo               *src2,
+                   ITensorInfo               *dst,
+                   float                      scale,
+                   ConvertPolicy              overflow_policy,
+                   RoundingPolicy             rounding_policy,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClMulKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo         *src1,
+                           const ITensorInfo         *src2,
+                           const ITensorInfo         *dst,
+                           float                      scale,
+                           ConvertPolicy              overflow_policy,
+                           RoundingPolicy             rounding_policy,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+
+/** Interface for the complex pixelwise multiplication kernel. */
+class ClComplexMulKernel : public ICLKernel
+{
+public:
+    ClComplexMulKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClComplexMulKernel);
+    /** Initialise the kernel's src and dst.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src1            An src tensor info. Data types supported: F32. Number of channels supported: 2.
+     * @param[in]  src2            An src tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
+     * @param[out] dst             The dst tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
+     * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const CLCompileContext    &compile_context,
+                   ITensorInfo               *src1,
+                   ITensorInfo               *src2,
+                   ITensorInfo               *dst,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClComplexMulKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo         *src1,
+                           const ITensorInfo         *src2,
+                           const ITensorInfo         *dst,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_MUL_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClPermuteKernel.cpp b/src/gpu/cl/kernels/ClPermuteKernel.cpp
new file mode 100644
index 0000000000..a4755782ed
--- /dev/null
+++ b/src/gpu/cl/kernels/ClPermuteKernel.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClPermuteKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+TensorShape get_dst_shape(const ITensorInfo *src, const PermutationVector &perm)
+{
+    TensorShape dst_shape = src->tensor_shape();
+    permute(dst_shape, perm);
+    return dst_shape;
+}
+
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() < 1 || src->num_dimensions() > 4,
+                                    "Permutation up to 4-D src tensor is supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(perm.num_dimensions() < 1 || perm.num_dimensions() > 4,
+                                    "Permutation vector size should be less than or equal to 4");
+    for (const auto &p : perm)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(p >= perm.num_dimensions(), "Permutation vector has invalid values");
+    }
+
+    // Validate configured dst
+    if (dst->total_size() != 0)
+    {
+        const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+    }
+    return Status{};
+}
+} // namespace
+
+ClPermuteKernel::ClPermuteKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClPermuteKernel::configure(const CLCompileContext  &compile_context,
+                                const ITensorInfo       *src,
+                                ITensorInfo             *dst,
+                                const PermutationVector &perm)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    auto              padding_info = get_padding_info({src, dst});
+    const TensorShape dst_shape    = get_dst_shape(src, perm);
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape));
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, perm));
+
+    _perm = perm;
+
+    // Create kernel
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" +
+                          get_cl_unsigned_type_from_element_size(data_size_from_type(src->data_type())));
+    build_opts.add_option("-DDEPTH_IN=" + support::cpp11::to_string(src->dimension(2)));
+    // New positions of  width(W), height(H), channel(C) and batch(D) based on permutation vector
+    build_opts.add_option("-DP1=" + support::cpp11::to_string((_perm.num_dimensions() >= 1) ? perm[0] : 0));
+    build_opts.add_option("-DP2=" + support::cpp11::to_string((_perm.num_dimensions() >= 2) ? perm[1] : 1));
+    build_opts.add_option("-DP3=" + support::cpp11::to_string((_perm.num_dimensions() >= 3) ? perm[2] : 2));
+    build_opts.add_option("-DP4=" + support::cpp11::to_string((_perm.num_dimensions() >= 4) ? perm[3] : 3));
+
+    _kernel = create_kernel(compile_context, "permute", build_opts.options());
+
+    // Configure  kernel window
+    Window win = calculate_max_window(*src, Steps());
+
+    ICLKernel::configure_internal(win);
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClPermuteKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, perm));
+
+    return Status{};
+}
+
+void ClPermuteKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
+
+    // Setup dst slice
+    Window slice_out(slice_in);
+    slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+    slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+    slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
+    slice_out.set(3, Window::Dimension(0, 0, 0));
+
+    do
+    {
+        unsigned int idx = 0;
+        add_4D_tensor_argument(idx, src, slice_in);
+        add_4D_tensor_argument(idx, dst, slice_out);
+        enqueue(queue, *this, slice_in, lws_hint());
+    } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClPermuteKernel.h b/src/gpu/cl/kernels/ClPermuteKernel.h
new file mode 100644
index 0000000000..2413b10284
--- /dev/null
+++ b/src/gpu/cl/kernels/ClPermuteKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_PERMUTE_KERNEL_H
+#define ARM_COMPUTE_CL_PERMUTE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to perform tensor permutation.
+ *
+ * Permutes given a permutation vector
+ */
+class ClPermuteKernel : public IClKernel
+{
+public:
+    ClPermuteKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClPermuteKernel);
+    /** Set the src and dst of the kernel.
+     *
+     * @note Arbitrary permutation vectors are supported with rank not greater than 4
+     *
+     * @param[in] compile_context The compile context to be used.
+     * @param[in] src             The src tensor info. Data types supported: All.
+     * @param[in] dst             The dst tensor info. Data types supported: Same as @p src
+     * @param[in] perm            Permutation vector
+     */
+    void configure(const CLCompileContext  &compile_context,
+                   const ITensorInfo       *src,
+                   ITensorInfo             *dst,
+                   const PermutationVector &perm);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClPermuteKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    PermutationVector _perm{};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_PERMUTE_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClPool2dKernel.cpp b/src/gpu/cl/kernels/ClPool2dKernel.cpp
new file mode 100644
index 0000000000..41ab4d6922
--- /dev/null
+++ b/src/gpu/cl/kernels/ClPool2dKernel.cpp
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClPool2dKernel.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo      *src,
+                          const ITensorInfo      *dst,
+                          const PoolingLayerInfo &pool_info,
+                          const ITensorInfo      *indices)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        (is_data_type_quantized_asymmetric(src->data_type()) && pool_info.pool_type == PoolingType::L2),
+        "Unsupported combination of parameters!");
+
+    const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
+    const int  idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int  idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const bool is_global_pooling = pool_info.is_global_pooling;
+    unsigned int pool_size_x     = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
+    unsigned int pool_size_y     = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
+    int          output_width    = 0;
+    int          output_height   = 0;
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(pool_info),
+                                    "Pooling region that is entirely outside input tensor is unsupported");
+
+    std::tie(output_width, output_height) =
+        scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], pool_size_x,
+                                 pool_size_y, pool_info.pad_stride_info);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1),
+                                    "Calculated output dimension size is invalid");
+
+    // Check indices
+    if (indices)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX,
+                                        "Pooling indices only supported for MAX pooling method");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)),
+                                        "Pooling indices only supported for pool size 2x2");
+
+        if (indices->total_size() != 0)
+        {
+            TensorInfo idx_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, DataType::U32));
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &idx_info);
+        }
+    }
+
+    // Checks performed when dst is configured
+    if (dst->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+        TensorInfo out_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, dst->data_type()));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info);
+    }
+
+    return Status{};
+}
+} // namespace
+
+ClPool2dKernel::ClPool2dKernel()
+{
+    _type = CLKernelType::POOL;
+}
+
+void ClPool2dKernel::configure(const ClCompileContext &compile_context,
+                               ITensorInfo            *src,
+                               ITensorInfo            *dst,
+                               const PoolingLayerInfo &pool_info,
+                               ITensorInfo            *indices)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices));
+
+    auto padding_info = get_padding_info({src, dst, indices});
+
+    // Auto init if empty
+    TensorShape out_shape = compute_pool_shape(*src, pool_info);
+    auto_init_if_empty(*dst, src->clone()->set_tensor_shape(out_shape));
+    if (indices)
+    {
+        auto_init_if_empty(*indices, src->clone()->set_tensor_shape(out_shape).set_data_type(DataType::U32));
+    }
+
+    // Set instance variables
+    _pool_info   = pool_info;
+    _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
+    _num_elems_processed_per_iteration =
+        (_data_layout == DataLayout::NCHW) ? 1 : ((dst->data_type() == DataType::F32) ? 2 : 4);
+    _num_elems_processed_per_iteration = adjust_vec_size(_num_elems_processed_per_iteration, dst->dimension(0));
+
+    int               pool_stride_x  = 0;
+    int               pool_stride_y  = 0;
+    const PoolingType pool_type      = pool_info.pool_type;
+    const int         idx_width      = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const int         idx_height     = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    const int         idx_channel    = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+    const int         idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES);
+    const int         pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
+    const int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
+    const PadStrideInfo pad_stride_info    = pool_info.pad_stride_info;
+    const bool          exclude_padding    = pool_info.exclude_padding;
+    std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
+    const int      pool_pad_top            = pad_stride_info.pad_top();
+    const int      pool_pad_left           = pad_stride_info.pad_left();
+    const DataType data_type               = src->data_type();
+
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_opts.add_option("-DPOOL_" + string_from_pooling_type(pool_type));
+    build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x));
+    build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y));
+    build_opts.add_option("-DPAD_X=" + support::cpp11::to_string(pool_pad_left));
+    build_opts.add_option("-DPAD_Y=" + support::cpp11::to_string(pool_pad_top));
+    build_opts.add_option("-DPOOL_SIZE_X=" + support::cpp11::to_string(pool_size_x));
+    build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y));
+    build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width)));
+    build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height)));
+    build_opts.add_option("-DMAX_WIDTH=" +
+                          support::cpp11::to_string(src->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left)));
+    build_opts.add_option("-DMAX_HEIGHT=" +
+                          support::cpp11::to_string(src->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top)));
+
+    // Tensor paddings are used to calculate the indicies for MAX pooling
+    if (pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices &&
+        is_data_type_float(data_type))
+    {
+        build_opts.add_option("-DSRC_BATCH=" + support::cpp11::to_string(src->tensor_shape().total_size_lower(3)));
+    }
+
+    if (is_data_type_quantized_asymmetric(data_type))
+    {
+        build_opts.add_option("-DQUANTIZED");
+
+        if (src->quantization_info() != dst->quantization_info())
+        {
+            const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
+            const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
+
+            build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
+            build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
+            build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale));
+            build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
+        }
+    }
+
+    // Set the initial value for the pooling operation accordingly with the data type
+    if (pool_type == PoolingType::MAX)
+    {
+        if (is_data_type_quantized(data_type))
+        {
+            PixelValue type_min{};
+            std::tie(type_min, std::ignore) = get_min_max(data_type);
+            build_opts.add_option("-DINITIAL_VALUE=" + support::cpp11::to_string(type_min.get<int32_t>()));
+        }
+        else
+        {
+            std::string initial_value = pool_info.use_inf_as_limit
+                                            ? "(-INFINITY)"
+                                            : float_to_string_with_full_precision(std::numeric_limits<float>::lowest());
+            build_opts.add_option("-DINITIAL_VALUE=" + initial_value);
+        }
+    }
+    else
+    {
+        // Pool AVG and Pool L2 initial value
+        build_opts.add_option("-DINITIAL_VALUE=0");
+    }
+
+    // Create kernel
+    switch (_data_layout)
+    {
+        case DataLayout::NCHW:
+        {
+            const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision;
+            const auto use_wider_accumulator  = use_fp_mixed_precision && (pool_type != PoolingType::MAX);
+            const auto acc_data_type          = get_cl_type_from_data_type(
+                         use_wider_accumulator ? DataType::F32
+                                               : (is_data_type_quantized(data_type) ? DataType::S32 : data_type));
+            build_opts.add_option("-DACC_DATA_TYPE=" + acc_data_type);
+            build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION");
+
+            if (pool_type != PoolingType::MAX)
+            {
+                build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
+            }
+
+            if (pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices &&
+                is_data_type_float(data_type))
+            {
+                // For max pooling with pool2x2, store indicies which will be used in max unpooling
+                std::string kernel_name = "pooling_layer_2_nchw_indices";
+                _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
+            }
+            else // Run general case
+            {
+                std::string kernel_name = "pooling_layer_MxN_nchw";
+                _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
+            }
+            break;
+        }
+        case DataLayout::NHWC:
+        {
+            // Floating point mixed precision is support on F16 only
+            const auto use_fp_mixed_precision =
+                (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX;
+
+            // Wider accumulation is required to avoid accuracy loss
+            // Case 1: Floating point mixed precision (fp16 src data and fp32 accumulation)
+            // Cast 2: Quantized (int8/uint8 src data and int32 accumulation )
+            DataType acc_data_type = data_type;
+
+            if (use_fp_mixed_precision)
+            {
+                acc_data_type = DataType::F32;
+            }
+            else if (is_data_type_quantized(data_type) && pool_type != PoolingType::MAX)
+            {
+                acc_data_type = DataType::S32;
+            }
+
+            build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(acc_data_type));
+            build_opts.add_option_if(use_fp_mixed_precision, "-DFP_MIXED_PRECISION");
+            build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
+            build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width)));
+            build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height)));
+            build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(idx_height)));
+            build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(idx_channel)));
+            build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(dst->dimension(idx_batch_size)));
+            build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                                  support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration));
+            if (pool_info.pool_size == Size2D(2, 2) && is_data_type_float(data_type))
+            {
+                build_opts.add_option_if(indices != nullptr && pool_type == PoolingType::MAX, "-DEXTRACT_MAX_INDEX");
+
+                std::string kernel_name = "pooling_layer_2x2_nhwc";
+                _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
+            }
+            else
+            {
+                std::string kernel_name = is_data_type_quantized_asymmetric(data_type)
+                                              ? "pooling_layer_MxN_quantized_nhwc"
+                                              : "pooling_layer_MxN_nhwc";
+                _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
+            }
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not implemented");
+    }
+
+    // Configure kernel window
+    Window win = calculate_max_window(*dst, Steps(_num_elems_processed_per_iteration));
+    ICLKernel::configure_internal(win);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "pooling_layer_";
+    _config_id += lower_string(string_from_data_type(data_type));
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_layout(_data_layout));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(idx_width));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(idx_height));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(idx_channel));
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_layout(src->data_layout()));
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClPool2dKernel::validate(const ITensorInfo      *src,
+                                const ITensorInfo      *dst,
+                                const PoolingLayerInfo &pool_info,
+                                const ITensorInfo      *indices)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices));
+    return Status{};
+}
+
+void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    unsigned int pool_stride_x             = 0;
+    unsigned int pool_stride_y             = 0;
+    std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
+
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst     = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_0));
+    auto indices = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_1));
+
+    // Collapse window
+    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+
+    switch (_data_layout)
+    {
+        case DataLayout::NCHW:
+        {
+            Window slice = window_collapsed.first_slice_window_3D();
+            do
+            {
+                // Set srcs
+                unsigned int idx = 0;
+                add_3D_tensor_argument(idx, src, slice);
+                add_3D_tensor_argument(idx, dst, slice);
+                if (indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_size == Size2D(2, 2)))
+                {
+                    add_3D_tensor_argument(idx, indices, slice);
+                }
+                enqueue(queue, *this, slice, lws_hint());
+            } while (window_collapsed.slide_window_slice_3D(slice));
+            break;
+        }
+        case DataLayout::NHWC:
+        {
+            const size_t batch_size = dst->info()->tensor_shape().total_size_upper(3);
+
+            Window slice    = window_collapsed.first_slice_window_4D();
+            Window in_slice = window_collapsed.first_slice_window_4D();
+            in_slice.set(Window::DimX,
+                         Window::Dimension(0, src->info()->dimension(0), _num_elems_processed_per_iteration));
+            in_slice.set(Window::DimY, Window::Dimension(0, src->info()->dimension(1), pool_stride_x));
+            in_slice.set(Window::DimZ, Window::Dimension(0, src->info()->dimension(2), pool_stride_y));
+            in_slice.set(3, Window::Dimension(0, batch_size, 1));
+            do
+            {
+                // Set srcs
+                unsigned int idx = 0;
+                add_4D_tensor_argument(idx, src, in_slice);
+                add_4D_tensor_argument(idx, dst, slice);
+                if (indices && is_data_type_float(src->info()->data_type()) &&
+                    (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2)))
+                {
+                    add_4D_tensor_argument(idx, indices, slice);
+                }
+                enqueue(queue, *this, slice, lws_hint());
+            } while (window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice));
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not implemented");
+    }
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClPool2dKernel.h b/src/gpu/cl/kernels/ClPool2dKernel.h
new file mode 100644
index 0000000000..56b95a37d5
--- /dev/null
+++ b/src/gpu/cl/kernels/ClPool2dKernel.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_POOL2D_KERNEL_H
+#define ARM_COMPUTE_CL_POOL2D_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the pooling layer kernel */
+class ClPool2dKernel : public IClKernel
+{
+public:
+    ClPool2dKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClPool2dKernel);
+
+    /** Configure kernel for a given list of arguments
+     *
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[out] dst             Destination tensor info. Data types supported: same as @p src.
+     * @param[in]  pool_info       Contains pooling operation information described in @ref PoolingLayerInfo.
+     * @param[out] indices         (optional) The indices of the maximal values. Data type supported: U32.
+     */
+    void configure(const ClCompileContext &compile_context,
+                   ITensorInfo            *src,
+                   ITensorInfo            *dst,
+                   const PoolingLayerInfo &pool_info,
+                   ITensorInfo            *indices = nullptr);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClPool2dKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo      *src,
+                           const ITensorInfo      *dst,
+                           const PoolingLayerInfo &pool_info,
+                           const ITensorInfo      *indices = nullptr);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+public:
+    PoolingLayerInfo _pool_info{};
+    DataLayout       _data_layout{DataLayout::UNKNOWN};
+    unsigned int     _num_elems_processed_per_iteration{1};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_POOL2D_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClPool3dKernel.cpp b/src/gpu/cl/kernels/ClPool3dKernel.cpp
new file mode 100644
index 0000000000..a08c5d4be7
--- /dev/null
+++ b/src/gpu/cl/kernels/ClPool3dKernel.cpp
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClPool3dKernel.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "utils/TypePrinter.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported");
+
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        (pool_info.stride.x() == 0 || pool_info.stride.y() == 0 || pool_info.stride.z() == 0),
+        "Strides cannot be zero.");
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8_SIGNED,
+                                                         DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) &&
+                                        (!pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG)),
+                                    "Exclude padding is unsupported for non-float types for Avg op");
+
+    const auto         data_layout       = src->data_layout();
+    const int          idx_width         = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int          idx_height        = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int          idx_depth         = get_data_layout_dimension_index(data_layout, DataLayoutDimension::DEPTH);
+    const bool         is_global_pooling = pool_info.is_global_pooling;
+    const unsigned int pool_size_x       = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
+    const unsigned int pool_size_y       = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
+    const unsigned int pool_size_z       = is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth;
+    int                output_width      = 0;
+    int                output_height     = 0;
+    int                output_depth      = 0;
+
+    bool round_type_ceil_with_asymm_padding =
+        (pool_info.round_type == DimensionRoundingType::CEIL) && (!is_symmetric(pool_info.padding));
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(round_type_ceil_with_asymm_padding,
+                                    "Cannot use dimension round type CEIL when padding is asymmetric.");
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_3d_region_entirely_outside_input(pool_info),
+                                    "Pooling region that is entirely outside input tensor is unsupported");
+    std::tie(output_width, output_height, output_depth) =
+        scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height],
+                                    src->tensor_shape()[idx_depth], pool_size_x, pool_size_y, pool_size_z, pool_info);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1),
+                                    "Calculated output dimension size is invalid");
+    // Checks performed when dst is configured
+    if (dst->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+        TensorInfo out_info(TensorInfo(compute_pool3d_shape(src->tensor_shape(), pool_info), 1, dst->data_type()));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info);
+    }
+
+    return Status{};
+}
+} // namespace
+
+ClPool3dKernel::ClPool3dKernel()
+{
+    _type = CLKernelType::POOL;
+}
+
+void ClPool3dKernel::configure(const ClCompileContext   &compile_context,
+                               const ITensorInfo        *src,
+                               ITensorInfo              *dst,
+                               const Pooling3dLayerInfo &pool_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info));
+    auto padding_info = get_padding_info({src, dst});
+
+    // Auto init if empty
+    TensorShape out_shape = compute_pool3d_shape(src->tensor_shape(), pool_info);
+    auto_init_if_empty(*dst, src->clone()->set_tensor_shape(out_shape));
+
+    // Set instance variables
+    _pool_info   = pool_info;
+    _data_layout = src->data_layout();
+
+    _num_elems_processed_per_iteration = (dst->data_type() == DataType::F32) ? 2 : 4;
+    _num_elems_processed_per_iteration = adjust_vec_size(_num_elems_processed_per_iteration, dst->dimension(0));
+
+    const PoolingType pool_type      = pool_info.pool_type;
+    const int         idx_width      = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const int         idx_height     = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    const int         idx_depth      = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::DEPTH);
+    const int         idx_channel    = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+    const int         idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES);
+    const int         pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
+    const int      pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
+    const int      pool_size_z = pool_info.is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth;
+    const bool     exclude_padding = pool_info.exclude_padding;
+    const int      pool_stride_x   = pool_info.stride.x();
+    const int      pool_stride_y   = pool_info.stride.y();
+    const int      pool_stride_z   = pool_info.stride.z();
+    const int      pool_pad_top    = pool_info.padding.top;
+    const int      pool_pad_left   = pool_info.padding.left;
+    const int      pool_pad_front  = pool_info.padding.front;
+    const DataType data_type       = src->data_type();
+
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_opts.add_option("-DPOOL_" + string_from_pooling_type(pool_type));
+    build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x));
+    build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y));
+    build_opts.add_option("-DSTRIDE_Z=" + support::cpp11::to_string(pool_stride_z));
+    build_opts.add_option("-DPAD_X=" + support::cpp11::to_string(pool_pad_left));
+    build_opts.add_option("-DPAD_Y=" + support::cpp11::to_string(pool_pad_top));
+    build_opts.add_option("-DPAD_Z=" + support::cpp11::to_string(pool_pad_front));
+    build_opts.add_option("-DPOOL_SIZE_X=" + support::cpp11::to_string(pool_size_x));
+    build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y));
+    build_opts.add_option("-DPOOL_SIZE_Z=" + support::cpp11::to_string(pool_size_z));
+    build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width)));
+    build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height)));
+    build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(idx_depth)));
+
+    // If datatype is quantized add relevant parameters
+    if (is_data_type_quantized_asymmetric(data_type) && src->quantization_info() != dst->quantization_info())
+    {
+        const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
+
+        build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
+        build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
+        build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale));
+        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
+    }
+
+    // Set the initial value for the pooling operation accordingly with the data type
+    if (pool_type == PoolingType::MAX)
+    {
+        if (is_data_type_quantized(data_type))
+        {
+            PixelValue type_min{};
+            std::tie(type_min, std::ignore) = get_min_max(data_type);
+            build_opts.add_option("-DINITIAL_VALUE=" + support::cpp11::to_string(type_min.get<int32_t>()));
+        }
+        else
+        {
+            build_opts.add_option("-DINITIAL_VALUE=" +
+                                  float_to_string_with_full_precision(std::numeric_limits<float>::lowest()));
+        }
+    }
+    else
+    {
+        // Pool AVG and Pool L2 initial value
+        build_opts.add_option("-DINITIAL_VALUE=0");
+    }
+    // Create kernel
+    // Floating point mixed precision is support on F16 only
+    const auto use_fp_mixed_precision =
+        (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX;
+
+    // Wider accumulation is required to avoid accuracy loss
+    // Case 1: Floating point mixed precision (fp16 src data and fp32 accumulation)
+    DataType acc_data_type = data_type;
+    if (use_fp_mixed_precision)
+    {
+        acc_data_type = DataType::F32;
+    }
+    else if (is_data_type_quantized(data_type) &&
+             pool_type != PoolingType::MAX) // Use S32 for avg pooling to allow for integer division
+    {
+        acc_data_type = DataType::S32;
+    }
+
+    build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(acc_data_type));
+    build_opts.add_option_if(use_fp_mixed_precision, "-DFP_MIXED_PRECISION");
+    build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
+    build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(idx_height)));
+    build_opts.add_option("-DDST_DEPTH=" + support::cpp11::to_string(dst->dimension(idx_depth)));
+    build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(idx_channel)));
+    build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(dst->dimension(idx_batch_size)));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration));
+
+    // if datatype is quantized use quantized kernel function
+    std::string kernel_name = (is_data_type_quantized_asymmetric(data_type) ? "pooling_3d_layer_MxN_ndhwc_quantized"
+                                                                            : "pooling_3d_layer_MxN_ndhwc");
+    _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*dst, Steps(_num_elems_processed_per_iteration));
+    ICLKernel::configure_internal(win);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "pooling_layer_3d";
+    _config_id += lower_string(string_from_data_type(data_type));
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_layout(_data_layout));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(idx_width));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(idx_height));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(idx_channel));
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_layout(src->data_layout()));
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClPool3dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info));
+    return Status{};
+}
+
+void ClPool3dKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_0));
+
+    // Collapse 3D window
+    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+
+    // Set CL kernel arguments
+    unsigned int idx = 0;
+    // Passing of the window not needed, as the steps are not used for the pool3d kernel
+    add_5D_tensor_argument(idx, src, window);
+    add_5D_tensor_argument(idx, dst, window);
+    enqueue(queue, *this, window_collapsed, lws_hint());
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClPool3dKernel.h b/src/gpu/cl/kernels/ClPool3dKernel.h
new file mode 100644
index 0000000000..6cd229c427
--- /dev/null
+++ b/src/gpu/cl/kernels/ClPool3dKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_POOL3D_KERNEL_H
+#define ARM_COMPUTE_CL_POOL3D_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the pooling layer kernel */
+class ClPool3dKernel : public IClKernel
+{
+public:
+    ClPool3dKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClPool3dKernel);
+
+    /** Configure kernel for a given list of arguments
+     *
+     * @note Asymmetric padding is not supported when dimension rounding type == CEIL.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED
+     * @param[out] dst             Destination tensor info. Data types supported: same as @p src.
+     * @param[in]  pool_info       Contains pooling operation information described in @ref Pooling3dLayerInfo.
+     */
+    void configure(const ClCompileContext   &compile_context,
+                   const ITensorInfo        *src,
+                   ITensorInfo              *dst,
+                   const Pooling3dLayerInfo &pool_info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClPool3dKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    Pooling3dLayerInfo _pool_info{};
+    DataLayout         _data_layout{DataLayout::UNKNOWN};
+    unsigned int       _num_elems_processed_per_iteration{1};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_POOL3D_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClQuantizeKernel.cpp b/src/gpu/cl/kernels/ClQuantizeKernel.cpp
new file mode 100644
index 0000000000..e8df420f67
--- /dev/null
+++ b/src/gpu/cl/kernels/ClQuantizeKernel.cpp
@@ -0,0 +1,184 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClQuantizeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F32, DataType::F16);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+
+    // Output must always be initialized
+    ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::QASYMM16);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+
+    return Status{};
+}
+} // namespace
+
+ClQuantizeKernel::ClQuantizeKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    auto padding_info = get_padding_info({src, dst});
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
+
+    const int  vec_size_x     = 16 / src->element_size();
+    const int  input_width_x  = src->tensor_shape().x();
+    const bool multi_access_x = (input_width_x / vec_size_x > 0);
+
+    const UniformQuantizationInfo qinfo            = dst->quantization_info().uniform();
+    const DataType                output_data_type = dst->data_type();
+
+    float   scale_to_apply  = qinfo.scale;
+    int32_t offset_to_apply = qinfo.offset;
+    if (is_data_type_quantized_asymmetric(src->data_type()))
+    {
+        /*
+         * In case of requantization of a quantized input tensor to an output tensor with another quantization
+         * instead of of apply dequantization and then a quantization functions, we just compute new scale and
+         * offset to apply.
+         *
+         * Assuming:
+         *   - q_i as input quantized value
+         *   - q_o as output quantized value
+         *   - z_i as input quantization offset value
+         *   - z_o as output quantization offset value
+         *   - s_i as input quantization scale value
+         *   - s_o as output quantization scale value
+         *   - z_n as new quantization offset value
+         *   - s_n as new quantization scale value
+         *
+         * q_o = ( q_i - z_i ) * s_i / s_o + z_o
+         *
+         * We can rewrite the formula as:
+         *
+         * q_o = ( q_i * s_i / s_o ) - z_i * s_i / s_o + z_o
+         *
+         * q_o = q_i / s_n + z_n
+         *
+         * Where:
+         *
+         * s_n = s_o / s_i
+         *
+         * z_n = - z_i * s_i / s_o + z_o
+         *
+         */
+        const UniformQuantizationInfo qinfo_in = src->quantization_info().uniform();
+        scale_to_apply /= qinfo_in.scale;
+        // In order to minimize flooring we convert the offset to a float,
+        // then compute the new offset in the float domain,
+        // finally we convert it back as int32_t
+        offset_to_apply -= static_cast<int32_t>(static_cast<float>(qinfo_in.offset) * qinfo_in.scale / qinfo.scale);
+    }
+
+    // Create kernel
+    CLBuildOptions build_opts;
+    build_opts.add_option_if(is_data_type_float(src->data_type()), "-DIS_FLOAT");
+    build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_to_apply));
+    build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_to_apply));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+    build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type()));
+    build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output_data_type));
+    build_opts.add_option_if(
+        multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0)));
+    std::pair<int, int> min_max_quant_values =
+        quantization::get_min_max_values_from_quantized_data_type(output_data_type);
+    build_opts.add_option("-DMIN_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.first));
+    build_opts.add_option("-DMAX_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.second));
+
+    _kernel = create_kernel(compile_context, "quantization_layer", build_opts.options());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*src, Steps());
+    if (multi_access_x)
+    {
+        win.set(Window::DimX,
+                Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+    }
+    ICLKernel::configure_internal(win);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClQuantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
+    return Status{};
+}
+
+void ClQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), 3);
+    Window slice            = window_collapsed.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src, slice);
+        add_3D_tensor_argument(idx, dst, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    } while (window_collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClQuantizeKernel.h b/src/gpu/cl/kernels/ClQuantizeKernel.h
new file mode 100644
index 0000000000..aeab28febe
--- /dev/null
+++ b/src/gpu/cl/kernels/ClQuantizeKernel.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_QUANTIZE_KERNEL_H
+#define ARM_COMPUTE_CL_QUANTIZE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the quantization layer kernel.
+ *
+ * @note The implementation supports only 3D input tensors.
+ */
+class ClQuantizeKernel : public IClKernel
+{
+public:
+    ClQuantizeKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClQuantizeKernel);
+    /** Set the input, output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
+     * @param[out] dst             Destination tensor info with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
+     *
+     * @note Output auto initialization is not supported by this kernel
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClQuantizeKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_QUANTIZE_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClReshapeKernel.cpp b/src/gpu/cl/kernels/ClReshapeKernel.cpp
new file mode 100644
index 0000000000..53889f3a6b
--- /dev/null
+++ b/src/gpu/cl/kernels/ClReshapeKernel.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2017-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClReshapeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+
+#include <string>
+
+/** [ClReshapeKernel Kernel] **/
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+
+    if (dst->tensor_shape().total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+        ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() != dst->tensor_shape().total_size());
+    }
+
+    return Status{};
+}
+} // namespace
+
+ClReshapeKernel::ClReshapeKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClReshapeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
+
+    auto padding_info = get_padding_info({src, dst});
+
+    // Create kernel
+    std::set<std::string> build_opts = {"-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size())};
+    _kernel                          = create_kernel(compile_context, "reshape_layer", build_opts);
+
+    // Add static arguments
+    const cl_int2 src_shape = {
+        {static_cast<cl_int>(src->tensor_shape()[0]), static_cast<cl_int>(src->tensor_shape()[1])}};
+    const cl_int2 dst_shape = {
+        {static_cast<cl_int>(dst->tensor_shape()[0]), static_cast<cl_int>(dst->tensor_shape()[1])}};
+    unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters
+    _kernel.setArg<cl_int2>(idx++, src_shape);
+    _kernel.setArg<cl_int2>(idx++, dst_shape);
+
+    // Configure kernel window
+    Window win = calculate_max_window(*dst);
+    ICLKernel::configure_internal(win);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
+
+    return Status{};
+}
+
+void ClReshapeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice            = window_collapsed.first_slice_window_3D();
+
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    // Set srcs
+    unsigned int idx = 0;
+    add_3D_tensor_argument(idx, src, window_collapsed);
+    add_3D_tensor_argument(idx, dst, window_collapsed);
+    enqueue(queue, *this, slice, lws_hint());
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+/** [ClReshapeKernel Kernel] **/
diff --git a/src/gpu/cl/kernels/ClReshapeKernel.h b/src/gpu/cl/kernels/ClReshapeKernel.h
new file mode 100644
index 0000000000..95eae82086
--- /dev/null
+++ b/src/gpu/cl/kernels/ClReshapeKernel.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_RESHAPE_KERNEL_H
+#define ARM_COMPUTE_CL_RESHAPE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the kernel to perform tensor reshaping */
+class ClReshapeKernel : public IClKernel
+{
+public:
+    ClReshapeKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClReshapeKernel);
+    /** Set the src and dst of the kernel
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data type supported: All.
+     * @param[out] dst             Destination tensor info. Data type supported: Same as @p src
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClReshapeKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_RESHAPE_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClScaleKernel.cpp b/src/gpu/cl/kernels/ClScaleKernel.cpp
new file mode 100644
index 0000000000..4305acad26
--- /dev/null
+++ b/src/gpu/cl/kernels/ClScaleKernel.cpp
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClScaleKernel.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/InterpolationPolicyUtils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/ScaleUtils.h"
+#include "support/Cast.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+inline std::tuple<float, float>
+calculate_scale_factors(const ITensorInfo *src, const ITensorInfo *dst, DataLayout data_layout, bool align_corners)
+{
+    const int idx_width  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+    // Compute the ratio between source width/height and destination width/height
+    const unsigned int src_width  = src->dimension(idx_width);
+    const unsigned int src_height = src->dimension(idx_height);
+    const unsigned int dst_width  = dst->dimension(idx_width);
+    const unsigned int dst_height = dst->dimension(idx_height);
+
+    float scale_x = arm_compute::scale_utils::calculate_resize_ratio(src_width, dst_width, align_corners);
+    float scale_y = arm_compute::scale_utils::calculate_resize_ratio(src_height, dst_height, align_corners);
+
+    return std::make_tuple(scale_x, scale_y);
+}
+
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON(dst == src);
+    ARM_COMPUTE_RETURN_ERROR_ON(src->num_channels() != 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        info.align_corners &&
+        !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy));
+    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) &&
+                                !is_data_type_quantized_asymmetric(src->data_type()));
+
+    float            scale_x     = 0.f;
+    float            scale_y     = 0.f;
+    const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
+    std::tie(scale_x, scale_y)   = calculate_scale_factors(src, dst, data_layout, info.align_corners);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(info.interpolation_policy == InterpolationPolicy::AREA &&
+                                (scale_x > 1.f || scale_y > 1.f));
+
+    return Status{};
+}
+} // namespace
+
+Status ClScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, info));
+    return Status{};
+}
+
+ClScaleKernel::ClScaleKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClScaleKernel::configure(const CLCompileContext &compile_context,
+                              ITensorInfo            *src,
+                              ITensorInfo            *dst,
+                              const ScaleKernelInfo  &info)
+{
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, info));
+    auto padding_info = get_padding_info({src, dst});
+
+    // Info required for the static tuning
+    _data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
+
+    const bool is_nhwc = _data_layout == DataLayout::NHWC;
+
+    float scale_x              = 0.f;
+    float scale_y              = 0.f;
+    std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, _data_layout, info.align_corners);
+
+    // Area interpolation behaves as Nearest Neighbour in case of up-sampling
+    auto interpolation_policy_to_use = info.interpolation_policy;
+    if (info.interpolation_policy == InterpolationPolicy::AREA && scale_x <= 1.f && scale_y <= 1.f)
+    {
+        interpolation_policy_to_use = InterpolationPolicy::NEAREST_NEIGHBOR;
+    }
+
+    // Create kernel
+    const int          idx_width         = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const int          idx_height        = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    const int          idx_channel       = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+    const unsigned int src_width         = src->dimension(idx_width);
+    const unsigned int src_height        = src->dimension(idx_height);
+    const unsigned int dst_width         = dst->dimension(idx_width);
+    const unsigned int dst_channels      = dst->dimension(idx_channel);
+    unsigned int       vec_size          = 0;
+    unsigned int       vec_size_leftover = 0;
+
+    CLBuildOptions build_opts;
+    if (_data_layout == DataLayout::NHWC)
+    {
+        vec_size          = adjust_vec_size(src->data_type() == DataType::F32 ? 4 : 8, dst_channels);
+        vec_size_leftover = dst_channels % vec_size;
+        build_opts.add_option("-DSRC_TENSOR_TYPE=BUFFER");
+        build_opts.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+        build_opts.add_option("-DDST_TENSOR_TYPE=BUFFER");
+        build_opts.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
+        build_opts.add_option("-DCONSTANT_VALUE=" +
+                              string_from_pixel_value(info.constant_border_value, src->data_type()));
+        build_opts.add_option("-DN0=" + support::cpp11::to_string(vec_size));
+        build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(vec_size_leftover));
+        build_opts.add_option("-DSCALE_" + string_from_interpolation_policy(interpolation_policy_to_use));
+        build_opts.add_option_if(src->num_dimensions() > 3, "-DBATCHED_EXECUTION");
+        build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
+        build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT");
+        build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS");
+        build_opts.add_option_if(is_data_type_float(src->data_type()), "-DIS_FLOATING_POINT");
+        build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER",
+                                      "-DSAMPLING_POLICY_TOP_LEFT");
+    }
+    else if (_data_layout == DataLayout::NCHW)
+    {
+        vec_size          = adjust_vec_size(4, dst_width);
+        vec_size_leftover = dst_width % vec_size;
+        build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+        build_opts.add_option("-DCONSTANT_VALUE=" +
+                              string_from_pixel_value(info.constant_border_value, src->data_type()));
+        build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width));
+        build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height));
+        build_opts.add_option("-DSCALE_X=" + float_to_string_with_full_precision(scale_x));
+        build_opts.add_option("-DSCALE_Y=" + float_to_string_with_full_precision(scale_y));
+        build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
+        build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + ((vec_size_leftover == 0)
+                                                            ? support::cpp11::to_string(vec_size)
+                                                            : support::cpp11::to_string(vec_size_leftover)));
+        build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
+        build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT");
+        build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS");
+        build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER",
+                                      "-DSAMPLING_POLICY_TOP_LEFT");
+
+        const bool is_qasymm_bilinear = is_data_type_quantized_asymmetric(src->data_type()) &&
+                                        info.interpolation_policy == InterpolationPolicy::BILINEAR;
+        if (is_qasymm_bilinear)
+        {
+            const UniformQuantizationInfo qinfo = src->quantization_info().uniform();
+            build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale));
+            build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qinfo.offset));
+        }
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR_ON("Unsupported data layout");
+    }
+
+    std::string interpolation_name = string_from_interpolation_policy(interpolation_policy_to_use);
+    std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower);
+    std::string kernel_name = "scale_" + interpolation_name + "_";
+    kernel_name += lower_string(string_from_data_layout(_data_layout));
+
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*dst, Steps(vec_size));
+    ICLKernel::configure_internal(win);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+
+    // Pass scale kernel arguments
+    if (is_nhwc)
+    {
+        unsigned int idx = 2 * num_arguments_per_4d_tensor_nhwc();
+        _kernel.setArg<cl_float>(idx++, scale_x);
+        _kernel.setArg<cl_float>(idx++, scale_y);
+    }
+    // Set config_id for enabling LWS tuning
+    _config_id = "scale_";
+    _config_id += (info.border_mode == BorderMode::REPLICATE ? "Bord_rep" : "");
+    _config_id += (info.sampling_policy == SamplingPolicy::CENTER ? "center" : "topleft");
+    _config_id += (is_nhwc ? "nhwc" : "nchw");
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(3));
+}
+
+void ClScaleKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    switch (_data_layout)
+    {
+        case DataLayout::NCHW:
+        {
+            Window slice = window.first_slice_window_2D();
+
+            do
+            {
+                unsigned int idx = 0;
+                add_2D_tensor_argument(idx, src, slice);
+                add_2D_tensor_argument(idx, dst, slice);
+                enqueue(queue, *this, slice, lws_hint());
+            } while (window.slide_window_slice_2D(slice));
+            break;
+        }
+        case DataLayout::NHWC:
+        {
+            Window collapsed = window.collapse(ICLKernel::window(), Window::DimZ);
+            Window slice     = collapsed.first_slice_window_4D();
+
+            unsigned int idx = 0;
+            add_4d_tensor_nhwc_argument(idx, src);
+            add_4d_tensor_nhwc_argument(idx, dst);
+            enqueue(queue, *this, slice, lws_hint());
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Data layout not supported");
+    }
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClScaleKernel.h b/src/gpu/cl/kernels/ClScaleKernel.h
new file mode 100644
index 0000000000..c09659017d
--- /dev/null
+++ b/src/gpu/cl/kernels/ClScaleKernel.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_SCALE_KERNEL_H
+#define ARM_COMPUTE_CL_SCALE_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the scale kernel */
+class ClScaleKernel : public IClKernel
+{
+public:
+    ClScaleKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClScaleKernel);
+    /** Initialise the kernel's inputs, output and interpolation policy
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
+     * @param[out] dst             Destination tensor info. Data types supported: Same as @p src
+     *                             All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in]  info            @ref ScaleKernelInfo Kernel descriptor to be used to configure.
+     */
+    void
+    configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClScaleKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    DataLayout _data_layout{DataLayout::UNKNOWN};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_SCALE_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClScatterKernel.cpp b/src/gpu/cl/kernels/ClScatterKernel.cpp
new file mode 100644
index 0000000000..19adc1ef34
--- /dev/null
+++ b/src/gpu/cl/kernels/ClScatterKernel.cpp
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClScatterKernel.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/DataTypeUtils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+
+namespace
+{
+constexpr int max_index_length = 5;
+} // namespace
+
+ClScatterKernel::ClScatterKernel()
+{
+}
+
+Status ClScatterKernel::validate(const ITensorInfo *updates,
+                                 const ITensorInfo *indices,
+                                 const ITensorInfo *dst,
+                                 const ScatterInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+
+    const TensorShape &ind_shape = indices->tensor_shape();
+    const TensorShape &upt_shape = updates->tensor_shape();
+    const TensorShape &dst_shape = dst->tensor_shape();
+
+    const int32_t upt_dims = upt_shape.num_dimensions();
+    const int32_t dst_dims = dst_shape.num_dimensions();
+    const int32_t ind_dims = ind_shape.num_dimensions();
+    const int32_t data_dim = upt_dims - (ind_dims - 1); // Number of batch dims is the number of indices dims - 1
+
+    const int32_t index_len = ind_shape[0];
+    bool          unsupported_padding_config =
+        (dst_dims == index_len) && index_len > 1 && (dst->has_padding() || updates->has_padding());
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(unsupported_padding_config, "Padding is not supported with these shapes.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(updates, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(indices, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(dst, DataType::F32, DataType::F16, DataType::S32, DataType::S16,
+                                                 DataType::S8, DataType::U32, DataType::U16, DataType::U8);
+
+    // Check data dims in update tensor and output tensor are equal
+    for (int32_t i = 0; i < data_dim; i++)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(upt_shape[i] != dst_shape[i],
+                                        "Data dims should be same size in both updates and ouput tensor.");
+    }
+
+    // Check if batch dims in indices and updates tensor are equal.
+    for (int32_t i = 0; i < ind_dims - 1; i++)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(upt_shape[data_dim + i] != ind_shape[i + 1],
+                                        "Batch dimensions should be the same in updates and indices tensor.");
+    }
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(ind_shape[1] != upt_shape[data_dim],
+                                    "Height of indices tensor should match size of highest dimension in updates tensor "
+                                    "(Excluding batch dimension)");
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        data_dim >= dst_dims, "Update tensor cannot have more dims than output tensor. (Excluding batch dimensions)");
+    ARM_COMPUTE_RETURN_ERROR_ON(index_len != dst_dims - data_dim);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((ind_dims < 2), "Shape of Indices tensor must be at least 2D");
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(index_len > max_index_length, "Maximum supported index length is 5!");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(index_len > dst_dims && dst_dims != 1,
+                                    "Index length should be smaller than or equal to number of output dims");
+
+    return Status{};
+}
+
+void ClScatterKernel::configure(const ClCompileContext &compile_context,
+                                const ITensorInfo      *updates,
+                                const ITensorInfo      *indices,
+                                ITensorInfo            *dst,
+                                const ScatterInfo      &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(updates, dst, indices);
+    ARM_COMPUTE_LOG_PARAMS(updates, indices, dst, info);
+
+    const TensorShape &dst_shape = dst->tensor_shape();
+    const int          index_len = indices->dimension(0);
+
+    // Check for single element data block
+    const bool is_scalar_block = (dst->num_dimensions() == static_cast<uint32_t>(index_len));
+
+    const int n0         = adjust_vec_size(16 / updates->element_size(), is_scalar_block ? 1 : updates->dimension(0));
+    const int partial_n0 = updates->dimension(0) % n0;
+
+    // The GWS will be 2D [x, y]
+    //  x-dimension refers to the x coordinate of the dst tensor
+    //  y-dimension refers to the collapsed y-coordinate of the data part of the dst tensor
+    Window win;
+
+    if (!is_scalar_block)
+    {
+        win = calculate_max_window(dst_shape, Steps(n0));
+
+        // Collapse the dimensions corresponding to indices in the execution window
+        for (int i = 0; i < index_len; ++i)
+        {
+            win.set(dst->num_dimensions() - (i + 1), Window::Dimension(0, 1, 1));
+        }
+
+        win = win.collapse(win, 1);
+    }
+
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
+    build_opts.add_option_if(is_data_type_float(dst->data_type()), "-DIS_FLOAT");
+
+    const int   num_dims      = dst->num_dimensions();
+    TensorShape ind_collapsed = indices->tensor_shape().collapsed_from(1);
+    build_opts.add_option("-DNUM_INDICES=" + support::cpp11::to_string(ind_collapsed[1]));
+    build_opts.add_option("-DINDEX_LENGTH=" + support::cpp11::to_string(index_len));
+
+    // We provide 5 variables to use in a constant array
+    for (int i = 1; i <= max_index_length; i++)
+    {
+        build_opts.add_option("-DOUT_SHAPE_N_MINUS_" + support::cpp11::to_string(i) + "=" +
+                              support::cpp11::to_string(dst_shape[std::max(num_dims - i, 0)]));
+    }
+
+    build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
+    build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_n0));
+
+    switch (info.func)
+    {
+        case ScatterFunction::Update:
+            build_opts.add_option("-DSCATTER_FUNCTION=UPDATE_OP");
+            build_opts.add_option("-DSKIP_OUTPUT_READ");
+            break;
+        case ScatterFunction::Add:
+            build_opts.add_option("-DSCATTER_FUNCTION=ADD_OP");
+            break;
+        case ScatterFunction::Sub:
+            build_opts.add_option("-DSCATTER_FUNCTION=SUB_OP");
+            break;
+        case ScatterFunction::Max:
+            build_opts.add_option("-DSCATTER_FUNCTION=MAX_OP");
+            break;
+        case ScatterFunction::Min:
+            build_opts.add_option("-DSCATTER_FUNCTION=MIN_OP");
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Not implemented");
+    }
+
+    // Create kernel
+    std::string kernel_name = "scatter_mp1d_2d_mpnd";
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
+    ICLKernel::configure_internal(win);
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_type(updates->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(2));
+    _config_id += "_";
+}
+
+void ClScatterKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    const auto updates =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto indices =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    const ITensorInfo *dst_info  = dst->info();
+    const ITensorInfo *upd_info  = updates->info();
+    const int          num_dims  = dst_info->num_dimensions();
+    const int          ind_dims  = indices->info()->num_dimensions();
+    const int          index_len = indices->info()->dimension(0);
+
+    bool unsupported_padding_config =
+        num_dims == index_len && index_len > 1 && (dst_info->has_padding() || upd_info->has_padding());
+    if (unsupported_padding_config)
+    {
+        ARM_COMPUTE_ERROR("Unsupported Configuration! Padding not supported with these shapes.");
+    }
+
+    // calculate m-dimensional data block strides in updates and destination tensors
+    const int upt_block_stride =
+        updates->info()->strides_in_bytes()[updates->info()->num_dimensions() - (ind_dims - 1)];
+
+    const int out_block_stride = dst_info->strides_in_bytes()[num_dims - index_len];
+
+    unsigned int idx = 0;
+
+    add_2D_tensor_argument(idx, updates, window);
+    add_2D_tensor_argument(idx, indices, window);
+    add_2D_tensor_argument(idx, dst, window);
+
+    _kernel.setArg<cl_int>(idx++, upt_block_stride);
+    _kernel.setArg<cl_int>(idx++, out_block_stride);
+
+    enqueue(queue, *this, window, lws_hint());
+}
+
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClScatterKernel.h b/src/gpu/cl/kernels/ClScatterKernel.h
new file mode 100644
index 0000000000..e1b469c88e
--- /dev/null
+++ b/src/gpu/cl/kernels/ClScatterKernel.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLSCATTERKERNEL_H
+#define ACL_SRC_GPU_CL_KERNELS_CLSCATTERKERNEL_H
+
+#include "arm_compute/function_info/ScatterInfo.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+
+class ClScatterKernel : public IClKernel
+{
+public:
+    ClScatterKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClScatterKernel);
+    /** Initialise the kernel's input and output.
+     *
+     * @note Negative indices are treated as out of bounds.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  updates         Input tensor info for the Update matrix. Data type supported: same as @p src
+     * @param[in]  indices         Input tensor info for the Indices matrix. Data type supported: S32.
+     * @param[out] dst             Output tensor info. Data type supported: same as @p src
+     * @param[in]  info            Attributes for Scatter Kernel
+     */
+    void configure(const ClCompileContext &compile_context,
+                   const ITensorInfo      *updates,
+                   const ITensorInfo      *indices,
+                   ITensorInfo            *dst,
+                   const ScatterInfo      &info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClScatterKernel::configure()
+     *
+     * @return a status
+     */
+    static Status
+    validate(const ITensorInfo *updates, const ITensorInfo *indices, const ITensorInfo *dst, const ScatterInfo &info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+
+#endif // ACL_SRC_GPU_CL_KERNELS_CLSCATTERKERNEL_H
diff --git a/src/gpu/cl/kernels/ClSoftmaxKernel.cpp b/src/gpu/cl/kernels/ClSoftmaxKernel.cpp
new file mode 100644
index 0000000000..796345a923
--- /dev/null
+++ b/src/gpu/cl/kernels/ClSoftmaxKernel.cpp
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClSoftmaxKernel.h"
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/CoreTypes.h"
+#include "arm_compute/core/Dimensions.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Steps.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/DataTypeUtils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+#include <string>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+
+ClSoftmaxKernel::ClSoftmaxKernel()
+{
+}
+
+Status ClSoftmaxKernel::validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info)
+{
+    ARM_COMPUTE_UNUSED(src, dst, info);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(src.num_dimensions() > 4);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN( //
+        &src, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(info.input_data_type != src.data_type());
+    ARM_COMPUTE_RETURN_ERROR_ON(info.axis < static_cast<int32_t>(-src.num_dimensions()) ||
+                                static_cast<int32_t>(src.num_dimensions()) <= info.axis);
+
+    if (is_data_type_quantized_asymmetric(src.data_type()))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(src.quantization_info().uniform().scale < 0);
+
+        ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() !=
+                                    get_softmax_output_quantization_info(src.data_type(), info.is_log));
+    }
+
+    return Status{};
+}
+
+void ClSoftmaxKernel::configure(const CLCompileContext  &compile_context,
+                                const ITensorInfo       &src,
+                                ITensorInfo             &dst,
+                                const SoftmaxKernelInfo &info)
+{
+    ARM_COMPUTE_UNUSED(compile_context, src, dst, info);
+
+    const auto &dst_shape = dst.tensor_shape();
+
+    const auto data_type    = src.data_type();
+    const auto element_size = src.element_size();
+
+    const auto is_quantized = data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED;
+    const auto src_qinfo    = src.quantization_info().uniform();
+    const auto dst_qinfo    = dst.quantization_info().uniform();
+
+    const auto axis   = wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions()));
+    const auto length = dst_shape[axis];
+
+    const auto tmp_data_type = is_quantized ? DataType::F32 : data_type;
+
+    const auto vec_size          = adjust_vec_size(16 / element_size, dst_shape[0]);
+    const auto vec_size_leftover = dst_shape[0] % vec_size;
+
+    std::string    kernel_name("softmax");
+    CLBuildOptions build_opts;
+
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_opts.add_option("-DTMP_DATA_TYPE=" + get_cl_type_from_data_type(tmp_data_type));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
+    build_opts.add_option("-DLENGTH=" + support::cpp11::to_string(length));
+    build_opts.add_option_if(info.is_log, "-DIS_LOG");
+    build_opts.add_option("-DBETA=" + float_to_string_with_full_precision(info.beta));
+
+    build_opts.add_option_if(is_quantized, "-DIS_QUANTIZED");
+    build_opts.add_option_if(is_quantized, "-DSRC_OFFSET=" + float_to_string_with_full_precision(src_qinfo.offset));
+    build_opts.add_option_if(is_quantized, "-DSRC_SCALE=" + float_to_string_with_full_precision(src_qinfo.scale));
+    build_opts.add_option_if(is_quantized, "-DDST_OFFSET=" + float_to_string_with_full_precision(dst_qinfo.offset));
+    build_opts.add_option_if(is_quantized, "-DDST_SCALE=" + float_to_string_with_full_precision(dst_qinfo.scale));
+
+    if (axis == 0)
+    {
+        kernel_name += "_x";
+        build_opts.add_option("-DSOFTMAX_X");
+
+        if (is_quantized)
+        {
+            _tmp_info = TensorInfo(dst_shape, 1, tmp_data_type);
+        }
+    }
+    else
+    {
+        kernel_name += "_non_x";
+        build_opts.add_option("-DSOFTMAX_NON_X");
+
+        TensorShape tmp_shape;
+
+        tmp_shape.set(0, length * vec_size, false);
+        tmp_shape.set(1, dst_shape[0] + (vec_size - vec_size_leftover) % vec_size, false);
+
+        for (size_t i = 2; i <= static_cast<size_t>(axis); ++i)
+        {
+            tmp_shape.set(i, dst_shape[i - 1], false);
+        }
+
+        for (size_t i = axis + 1; i < dst_shape.num_dimensions(); ++i)
+        {
+            tmp_shape.set(i, dst_shape[i], false);
+        }
+
+        _tmp_info = TensorInfo(tmp_shape, 1, tmp_data_type);
+    }
+
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel window and kernel arguments.
+    Window win = calculate_max_window(src, Steps(vec_size));
+
+    bool has_collapsed = true;
+
+    win = win.shift_dimensions(1, axis); // Remove this axis from the window/GWS.
+    win = win.collapse_if_possible(win, 2, has_collapsed);
+    ARM_COMPUTE_ERROR_ON(!has_collapsed);
+
+    ICLKernel::configure_internal(win);
+
+    _axis = axis;
+
+    _config_id = "softmax_" + lower_string(string_from_data_type(data_type));
+    _config_id += "_" + std::to_string(axis);
+    _config_id += "_" + std::to_string(length);
+}
+
+void ClSoftmaxKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    ICLTensor *tmp = (_tmp_info.total_size() > 0)
+                         ? utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_INT_0))
+                         : nullptr;
+
+    if (!_prepared)
+    {
+        _prepared = true;
+
+        const auto *src_info    = src->info();
+        const auto *dst_info    = dst->info();
+        auto        src_strides = src_info->strides_in_bytes();
+        auto        dst_strides = dst_info->strides_in_bytes();
+
+        const auto src_stride_axis = src_strides[_axis];
+        const auto dst_stride_axis = dst_strides[_axis];
+
+        // This axis has been removed from execution window, hence we remove it from the list of strides
+        // provided to the kernel.
+        // In case axis > 0, src/dst_stride_axis will be provided in dedicated argument independent from global ID.
+        src_strides.remove(_axis);
+        dst_strides.remove(_axis);
+
+        // Argument 0: src_ptr.
+        _kernel.setArg<cl_uint>(1, src_strides[0]);
+        _kernel.setArg<cl_uint>(2, src_strides[1]);
+        _kernel.setArg<cl_uint>(3, src_strides[2]);
+        _kernel.setArg<cl_uint>(4, src_info->offset_first_element_in_bytes());
+
+        // Argument 5: dst_ptr.
+        _kernel.setArg<cl_uint>(6, dst_strides[0]);
+        _kernel.setArg<cl_uint>(7, dst_strides[1]);
+        _kernel.setArg<cl_uint>(8, dst_strides[2]);
+        _kernel.setArg<cl_uint>(9, dst_info->offset_first_element_in_bytes());
+
+        if (tmp != nullptr)
+        {
+            const auto *tmp_info    = tmp->info();
+            const auto &tmp_strides = tmp_info->strides_in_bytes();
+
+            // Argument 10: tmp_ptr.
+            _kernel.setArg<cl_uint>(11, tmp_strides[1]);
+            _kernel.setArg<cl_uint>(12, tmp_strides[2]);
+            _kernel.setArg<cl_uint>(13, tmp_strides[3]);
+            _kernel.setArg<cl_uint>(14, 0);
+        }
+
+        if (_axis > 0)
+        {
+            _kernel.setArg<cl_uint>(15, src_stride_axis);
+            _kernel.setArg<cl_uint>(16, dst_stride_axis);
+        }
+    }
+
+    _kernel.setArg(0, src->cl_buffer());
+    _kernel.setArg(5, dst->cl_buffer());
+
+    if (tmp != nullptr)
+    {
+        _kernel.setArg(10, tmp->cl_buffer());
+    }
+
+    enqueue(queue, *this, window, lws_hint());
+}
+
+const TensorInfo &ClSoftmaxKernel::tmp_tensor_info() const
+{
+    return _tmp_info;
+}
+
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClSoftmaxKernel.h b/src/gpu/cl/kernels/ClSoftmaxKernel.h
new file mode 100644
index 0000000000..130dc7835c
--- /dev/null
+++ b/src/gpu/cl/kernels/ClSoftmaxKernel.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLSOFTMAXKERNEL_H
+#define ACL_SRC_GPU_CL_KERNELS_CLSOFTMAXKERNEL_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+
+/** The CL kernel that performs softmax function. */
+class ClSoftmaxKernel : public IClKernel
+{
+public:
+    ClSoftmaxKernel();
+
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClSoftmaxKernel);
+
+    /** Check if the kernel arguments are valid.
+     *
+     * See @ref ClSoftmaxKernel::configure().
+     *
+     * @return The status.
+     */
+    static Status validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info);
+
+    /** Configure the kernel.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax
+     * @param[out] dst             Destination tensor info. Data types supported: same as @p src
+     * @param[in]  info            Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
+     */
+    void configure(const CLCompileContext  &compile_context,
+                   const ITensorInfo       &src,
+                   ITensorInfo             &dst,
+                   const SoftmaxKernelInfo &info);
+
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+    /** Get the tensor info of the temporary tensor. */
+    const TensorInfo &tmp_tensor_info() const;
+
+private:
+    bool       _prepared{false};
+    int32_t    _axis{0};
+    TensorInfo _tmp_info{};
+};
+
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_KERNELS_CLSOFTMAXKERNEL_H
diff --git a/src/gpu/cl/kernels/ClTransposeKernel.cpp b/src/gpu/cl/kernels/ClTransposeKernel.cpp
new file mode 100644
index 0000000000..f95a215107
--- /dev/null
+++ b/src/gpu/cl/kernels/ClTransposeKernel.cpp
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClTransposeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+ClTransposeKernel::ClTransposeKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClTransposeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    // Output auto initialization if not yet initialized
+    const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src);
+    auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape));
+
+    // Explicitly set the tensor shape to preserve dimensions
+    dst->set_tensor_shape(dst_shape);
+
+    ARM_COMPUTE_ERROR_THROW_ON(ClTransposeKernel::validate(src, dst));
+    auto padding_info = get_padding_info({src, dst});
+
+    unsigned int vec_size_x;
+    unsigned int vec_size_y;
+
+    // Set the optimal tile size for each data type without register spilling
+    switch (src->element_size())
+    {
+        case 1:
+            vec_size_x = adjust_vec_size(8, src->dimension(0));
+            vec_size_y = adjust_vec_size(16, src->dimension(1));
+            break;
+        case 2:
+            vec_size_x = adjust_vec_size(8, src->dimension(0));
+            vec_size_y = adjust_vec_size(8, src->dimension(1));
+            break;
+        case 4:
+            vec_size_x = adjust_vec_size(4, src->dimension(0));
+            vec_size_y = adjust_vec_size(8, src->dimension(1));
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported data type");
+            break;
+    }
+
+    const int vec_size_x_leftovers = src->dimension(0) % vec_size_x;
+    const int vec_size_y_leftovers = src->dimension(1) % vec_size_y;
+
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE_IN_BYTES=" + support::cpp11::to_string(src->element_size()));
+    build_opts.add_option("-DVEC_SIZE_X=" + support::cpp11::to_string(vec_size_x));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER_X=" + support::cpp11::to_string(vec_size_x_leftovers));
+    build_opts.add_option("-DVEC_SIZE_Y=" + support::cpp11::to_string(vec_size_y));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER_Y=" + support::cpp11::to_string(vec_size_y_leftovers));
+
+    _kernel = create_kernel(compile_context, "transpose", build_opts.options());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*src, Steps(vec_size_x, vec_size_y));
+    ICLKernel::configure_internal(win);
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+
+    // Validate configured dst
+    if (dst->total_size() != 0)
+    {
+        const TensorInfo dst_info =
+            src->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*src));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &dst_info);
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+    }
+
+    return Status{};
+}
+
+void ClTransposeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    // Collapse dimensions higher than width and height into the batch dimension
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src, slice);
+        add_3D_tensor_argument(idx, dst, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    } while (collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClTransposeKernel.h b/src/gpu/cl/kernels/ClTransposeKernel.h
new file mode 100644
index 0000000000..eaad38b20f
--- /dev/null
+++ b/src/gpu/cl/kernels/ClTransposeKernel.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_KERNELS_CLTRANSPOSEKERNEL_H
+#define ACL_SRC_GPU_CL_KERNELS_CLTRANSPOSEKERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to transpose a tensor. Only the first two dimensions (width, height) are transposed. */
+class ClTransposeKernel : public IClKernel
+{
+public:
+    ClTransposeKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClTransposeKernel);
+    /** Set the src and dst of the kernel.
+     *
+     * @param[in] compile_context The compile context to be used.
+     * @param[in] src             The src tensor info. Data types supported: All.
+     * @param[in] dst             The dst tensor info. Data types supported: Same as @p src
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClTransposeKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_KERNELS_CLTRANSPOSEKERNEL_H
diff --git a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp
new file mode 100644
index 0000000000..76f39ac500
--- /dev/null
+++ b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClTransposedConvolutionKernel.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo   *input,
+                          const ITensorInfo   *weights,
+                          const ITensorInfo   *biases,
+                          const ITensorInfo   *output,
+                          const PadStrideInfo &deconv_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32,
+                                                         DataType::QASYMM8_SIGNED, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(weights, DataLayout::NHWC);
+
+    constexpr unsigned int channel_idx = 0;
+    constexpr unsigned int width_idx   = 1;
+    constexpr unsigned int height_idx  = 2;
+    constexpr unsigned int batch_idx   = 3;
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != input->dimension(channel_idx),
+                                    "Weights feature map dimension should match the respective src's one");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional");
+
+    if (biases != nullptr)
+    {
+        if (is_data_type_quantized_asymmetric(input->data_type()))
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
+        }
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(channel_idx) != weights->dimension(batch_idx),
+                                        "Biases size and number of dst feature maps should match");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, "Biases should be one dimensional");
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
+    }
+
+    // Checks performed when output is configured
+    if (output->total_size() != 0)
+    {
+        const size_t input_width    = input->dimension(width_idx);
+        const size_t input_height   = input->dimension(height_idx);
+        const size_t weights_width  = weights->dimension(width_idx);
+        const size_t weights_height = weights->dimension(height_idx);
+
+        auto out_dims =
+            deconvolution_output_dimensions(input_width, input_height, weights_width, weights_height, deconv_info);
+        TensorShape output_shape =
+            misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(output, DataLayout::NHWC);
+    }
+
+    return Status{};
+}
+} // namespace
+
+void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_context,
+                                              const ITensorInfo      *input,
+                                              const ITensorInfo      *weights,
+                                              const ITensorInfo      *biases,
+                                              ITensorInfo            *output,
+                                              const PadStrideInfo    &deconv_info)
+{
+    ARM_COMPUTE_UNUSED(biases, deconv_info);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+    // Perform validation
+    ARM_COMPUTE_ERROR_THROW_ON(validate(input, weights, biases, output, deconv_info));
+
+    constexpr unsigned int channel_idx = 0;
+    constexpr unsigned int width_idx   = 1;
+    constexpr unsigned int height_idx  = 2;
+
+    const size_t input_channels  = input->dimension(channel_idx); // same as weight channels
+    const size_t input_width     = input->dimension(width_idx);
+    const size_t input_height    = input->dimension(height_idx);
+    const size_t weights_width   = weights->dimension(width_idx);
+    const size_t weights_height  = weights->dimension(height_idx);
+    const size_t output_width    = output->dimension(width_idx);
+    const size_t output_height   = output->dimension(height_idx);
+    const size_t output_channels = output->dimension(channel_idx);
+
+    // Calculate output shape
+    auto out_dims =
+        deconvolution_output_dimensions(input_width, input_height, weights_width, weights_height, deconv_info);
+    TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
+    auto_init_if_empty(*output, output_shape, 1, input->data_type(), input->quantization_info());
+
+    // Calculate updated paddings
+    // p' = k - p - 1 (k: kernel dimensions)
+    const uint32_t pad_left = weights_width - deconv_info.pad_left() - 1;
+    const uint32_t pad_top  = weights_height - deconv_info.pad_top() - 1;
+
+    // Configure kernel window
+    Window win;
+    output_shape.collapse(2U, 1U); // Collapse width and height into single dimension
+
+    const unsigned int n0               = adjust_vec_size(16 / output->element_size(), output_channels);
+    const unsigned int m0               = 1;
+    const unsigned int k0               = adjust_vec_size(16 / input->element_size(), input_channels);
+    const unsigned int partial_store_n0 = output_channels % n0;
+
+    // Create window and update padding
+    win = calculate_max_window(output_shape, Steps(n0, m0));
+    ICLKernel::configure_internal(win);
+
+    const std::string kernel_name = "transposed_convolution_nhwc";
+    CLBuildOptions    build_options;
+
+    const DataType    input_data_type = input->data_type();
+    const PaddingInfo strides         = deconv_info.stride();
+
+    if (biases != nullptr)
+    {
+        build_options.add_option(std::string("-DHAS_BIAS"));
+        build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type())));
+    }
+
+    const auto output_data_type = output->data_type();
+
+    build_options.add_option("-cl-fast-relaxed-math");
+    build_options.add_option("-DSRC_TENSOR_TYPE=BUFFER");
+    build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(input_data_type));
+    build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(input_channels));
+    build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width));
+    build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height));
+    build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(output_channels));
+    build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(output_width));
+    build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(output_height));
+    build_options.add_option("-DDST_TENSOR_TYPE=BUFFER");
+    build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(output_data_type));
+    build_options.add_option("-DWEI_TENSOR_TYPE=BUFFER");
+    build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights_width));
+    build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights_height));
+    build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(weights->data_type()));
+    build_options.add_option("-DSTRIDE_X=" + support::cpp11::to_string(strides.first));
+    build_options.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(strides.second));
+    build_options.add_option("-DPAD_LEFT=" + support::cpp11::to_string(pad_left));
+    build_options.add_option("-DPAD_TOP=" + support::cpp11::to_string(pad_top));
+    build_options.add_option("-DN0=" + support::cpp11::to_string(n0));
+    build_options.add_option("-DM0=" + support::cpp11::to_string(m0));
+    build_options.add_option("-DK0=" + support::cpp11::to_string(k0));
+    build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
+    build_options.add_option_if((input_channels % k0) != 0, "-DLEFTOVER_LOOP");
+
+    if (is_data_type_quantized(output_data_type))
+    {
+        const UniformQuantizationInfo iqinfo = input->quantization_info().uniform();
+        const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
+        const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
+
+        PixelValue zero_value = PixelValue(0, input->data_type(), input->quantization_info());
+        int        zero_value_s32;
+        zero_value.get(zero_value_s32);
+
+        float multiplier        = iqinfo.scale * wqinfo.scale / oqinfo.scale;
+        int   output_multiplier = 0;
+        int   output_shift      = 0;
+
+        quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
+        build_options.add_option("-DIS_QUANTIZED");
+        build_options.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
+        build_options.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift));
+        build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(-iqinfo.offset));
+        build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(-wqinfo.offset));
+        build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset));
+        build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32));
+        build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32));
+    }
+    else
+    {
+        build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(input_data_type));
+        build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(0));
+    }
+
+    if (compile_context.get_ddk_version() >= 30)
+    {
+        build_options.add_option("-fregister-allocation=64");
+    }
+
+    _kernel = create_kernel(compile_context, kernel_name, build_options.options());
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_type(input_data_type));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(weights_width);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(strides.first);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(strides.second);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output_width);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(m0);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(n0);
+}
+
+Status ClTransposedConvolutionKernel::validate(const ITensorInfo   *src,
+                                               const ITensorInfo   *weights,
+                                               const ITensorInfo   *biases,
+                                               const ITensorInfo   *dst,
+                                               const PadStrideInfo &deconv_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, deconv_info));
+    return Status{};
+}
+
+void ClTransposedConvolutionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    // Get initial windows
+    Window slice = window.first_slice_window_3D();
+
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto weights =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const auto biases =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    unsigned int idx = 0;
+    add_4d_tensor_nhwc_argument(idx, src);
+    add_4d_tensor_nhwc_argument(idx, dst);
+
+    add_4d_tensor_nhwc_argument(idx, weights);
+    if (biases != nullptr)
+    {
+        add_1D_tensor_argument(idx, biases, slice);
+    }
+
+    enqueue(queue, *this, slice, lws_hint());
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h
new file mode 100644
index 0000000000..44f6f56b7a
--- /dev/null
+++ b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_KERNEL_H
+#define ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel for transposed convolution. */
+class ClTransposedConvolutionKernel : public IClKernel
+{
+public:
+    ClTransposedConvolutionKernel() = default;
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClTransposedConvolutionKernel);
+    /** Set the input, weights, biases and output tensors.
+     *
+     * Similar to @ref ClTransposedConvolution::configure()
+     *
+     */
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo      *input,
+                   const ITensorInfo      *weights,
+                   const ITensorInfo      *biases,
+                   ITensorInfo            *output,
+                   const PadStrideInfo    &deconv_info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClTransposedConvolution::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo   *input,
+                           const ITensorInfo   *weights,
+                           const ITensorInfo   *biases,
+                           const ITensorInfo   *output,
+                           const PadStrideInfo &deconv_info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+
+#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp b/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp
new file mode 100644
index 0000000000..af80c4d796
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClWeightsReshapeKernel.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+using namespace misc::shape_calculator;
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input,
+                          const ITensorInfo *biases,
+                          const ITensorInfo *output,
+                          unsigned int       num_groups)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
+    ARM_COMPUTE_RETURN_ERROR_ON(num_groups == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::NHWC && num_groups > 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4 && num_groups > 1);
+    ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(3) % num_groups) != 0);
+
+    if (biases != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(!is_data_type_float(input->data_type()));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+        ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->num_dimensions() != 1));
+        ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->num_dimensions() != 2));
+        ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) &&
+                                    (biases->dimension(0) != input->tensor_shape()[3]));
+        ARM_COMPUTE_RETURN_ERROR_ON(
+            (input->num_dimensions() == 5) &&
+            (biases->dimension(0) != input->tensor_shape()[3] || biases->dimension(1) != input->tensor_shape()[4]));
+    }
+
+    // Checks performed when output is configured
+    if (output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+            output->tensor_shape(), compute_weights_reshaped_shape(*input, biases != nullptr, num_groups));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
+    }
+
+    return Status{};
+}
+} // namespace
+
+ClWeightsReshapeKernel::ClWeightsReshapeKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClWeightsReshapeKernel::configure(const ClCompileContext &compile_context,
+                                       const ITensorInfo      *src,
+                                       const ITensorInfo      *biases,
+                                       ITensorInfo            *dst,
+                                       unsigned int            num_groups)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    // Output tensor auto inizialitation if not yet initialized
+    auto_init_if_empty(
+        *dst, src->clone()->set_tensor_shape(compute_weights_reshaped_shape(*src, (biases != nullptr), num_groups)));
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, biases, dst, num_groups));
+    auto padding_info = get_padding_info({src, biases, dst});
+
+    const DataType data_type = src->data_type();
+
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(data_type)));
+    build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
+    build_opts.add_option_if(biases != nullptr, "-DHAS_BIAS");
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, "reshape_to_columns", build_opts.options());
+
+    // Configure window
+    Window win = calculate_max_window(*src, Steps());
+    ICLKernel::configure_internal(win);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClWeightsReshapeKernel::validate(const ITensorInfo *src,
+                                        const ITensorInfo *biases,
+                                        const ITensorInfo *dst,
+                                        unsigned int       num_groups)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, biases, dst, num_groups));
+    return Status{};
+}
+
+void ClWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+    auto src    = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto biases = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+    auto dst    = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    Window out_window;
+    out_window.use_tensor_dimensions(dst->info()->tensor_shape());
+
+    Window in_slice  = window.first_slice_window_3D();
+    Window out_slice = out_window.first_slice_window_2D();
+
+    Window biases_window;
+    Window biases_slice;
+
+    unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor();
+    idx += (biases != nullptr) ? num_arguments_per_1D_tensor() : 0;
+    _kernel.setArg<cl_uint>(idx++, src->info()->dimension(0));
+    _kernel.setArg<cl_uint>(idx++, src->info()->dimension(1));
+    _kernel.setArg<cl_uint>(idx++, src->info()->dimension(2));
+    _kernel.setArg<cl_uint>(idx++, src->info()->dimension(3));
+    _kernel.setArg<cl_uint>(idx++, dst->info()->strides_in_bytes().z());
+
+    if (biases != nullptr)
+    {
+        biases_window.use_tensor_dimensions(biases->info()->tensor_shape());
+        biases_slice = biases_window.first_slice_window_1D();
+    }
+
+    do
+    {
+        // Set arguments
+        unsigned idx = 0;
+        add_3D_tensor_argument(idx, src, in_slice);
+        add_2D_tensor_argument(idx, dst, out_slice);
+        if (biases != nullptr)
+        {
+            add_1D_tensor_argument(idx, biases, biases_slice);
+            ARM_COMPUTE_UNUSED(biases_window.slide_window_slice_1D(biases_slice));
+        }
+
+        // Run kernel
+        enqueue(queue, *this, in_slice, lws_hint());
+    } while (window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClWeightsReshapeKernel.h b/src/gpu/cl/kernels/ClWeightsReshapeKernel.h
new file mode 100644
index 0000000000..5e05f8d006
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWeightsReshapeKernel.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_WEIGHTSRESHAPE_KERNEL_H
+#define ARM_COMPUTE_CL_WEIGHTSRESHAPE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to perform reshaping on the weights used by convolution and locally connected layer
+ *
+ * Rearranges each 3-dimensional kernel to a single row leading to a matrix with linearized kernels.
+ * In combination with the @ref opencl::kernels::ClIm2ColKernel can transform a convolution to a matrix multiplication.
+ *
+ * For example assuming a 3D weight kernel of 3x3 dimensions and depth of 2 we have:
+ * @f[
+ * \left( \begin{array}{ccc}
+ * a000 & a001 & a002 \\
+ * a010 & a011 & a012 \\
+ * a020 & a021 & a022 \\
+ * \end{array} \right)
+ * \left( \begin{array}{ccc}
+ * a100 & a101 & a102 \\
+ * a110 & a111 & a112 \\
+ * a120 & a121 & a122 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccccccccc}
+ * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\
+ * \end{array} \right)
+ * @f]
+ */
+class ClWeightsReshapeKernel : public IClKernel
+{
+public:
+    ClWeightsReshapeKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWeightsReshapeKernel);
+    /** Set the input and output of the kernel.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             The input tensor info to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
+     *                             and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM,  num_patches] if unshared. Data types supported: All
+     * @param[in]  biases          The shared biases tensor info to append.  Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
+     *                             dimensions [OFM, num_patches] if unshared. Data types supported: F16/F32, for quantized types this must be nullptr.
+     *                             @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
+     * @param[out] dst             The output tensor info. Should be a 2D Tensor if there are no groups and the weights are not shared; a 3D Tensor otherwise.
+     *                             Data types supported: Same as @p input
+     * @param[in]  num_groups      (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
+     *                             Number of groups greater than one are only supported for NCHW data layout, and the number of weights must be a multiple of it.
+     */
+    void configure(const ClCompileContext &compile_context,
+                   const ITensorInfo      *src,
+                   const ITensorInfo      *biases,
+                   ITensorInfo            *dst,
+                   unsigned int            num_groups = 1);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClWeightsReshapeKernel::configure()
+     *
+     * @return a status
+     */
+    static Status
+    validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst, unsigned int num_groups = 1);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CL_WEIGHTSRESHAPE_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp
new file mode 100644
index 0000000000..15195025ce
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2018-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/helpers/tensor_info.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1);
+    ARM_COMPUTE_RETURN_ERROR_ON(src1->data_type() == DataType::UNKNOWN);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) > dst->dimension(0));
+
+    for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(i) != dst->dimension(i));
+        ARM_COMPUTE_RETURN_ERROR_ON(src2->dimension(i) != dst->dimension(i));
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON(src1->num_dimensions() > 4);
+
+    return Status{};
+}
+} // namespace
+
+Status
+ClWidthConcatenate2TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst));
+    return Status{};
+}
+
+ClWidthConcatenate2TensorsKernel::ClWidthConcatenate2TensorsKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile_context,
+                                                 ITensorInfo            *src1,
+                                                 ITensorInfo            *src2,
+                                                 ITensorInfo            *dst)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst));
+
+    auto padding_info = get_padding_info({src1, src2, dst});
+
+    const unsigned int min_dimension                     = std::min(src1->dimension(0), src2->dimension(0));
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension);
+    const unsigned int vec_size_leftover                 = dst->dimension(0) % num_elems_processed_per_iteration;
+
+    // Add build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src1->data_type()));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
+    build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src1->element_size()));
+    build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) %
+                                                                           num_elems_processed_per_iteration));
+
+    // If input have different quantization info set quantization parameters needed for the re-quantization process
+    const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2);
+    if (is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo)
+    {
+        const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform();
+        const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info  = dst->quantization_info().uniform();
+
+        build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq1_info.offset));
+        build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale));
+        build_opts.add_option("-DOFFSET_IN2=" + float_to_string_with_full_precision(iq2_info.offset));
+        build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale));
+        build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
+        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
+    }
+
+    _depth        = src1->dimension(2);
+    _input1_width = src1->dimension(0);
+
+    std::string kernel_name = "concatenate_width_x2";
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration));
+    ICLKernel::configure_internal(win.collapse(win, Window::DimZ));
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "concatenate_width_x2_";
+    _config_id += lower_string(string_from_data_type(src1->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src1->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src1->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src2->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src2->dimension(1));
+}
+
+void ClWidthConcatenate2TensorsKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_4D();
+
+    const auto src0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC));
+    const auto src1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    do
+    {
+        unsigned int idx = 0;
+        add_4D_tensor_argument(idx, src0, slice);
+        add_4D_tensor_argument(idx, src1, slice);
+        add_4D_tensor_argument(idx, dst, slice);
+        _kernel.setArg<cl_int>(idx++, _depth);
+        _kernel.setArg<cl_int>(idx++, _input1_width);
+        enqueue(queue, *this, window, lws_hint());
+    } while (window.slide_window_slice_4D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h
new file mode 100644
index 0000000000..8b53d6d66b
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_WIDTHCONCATENATE_2TENSORS_KERNEL_H
+#define ARM_COMPUTE_CL_WIDTHCONCATENATE_2TENSORS_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the width concatenate kernel of 2 tensors.
+ *  The src1 and src2 tensors will be concatenated into the dst tensor.
+ */
+class ClWidthConcatenate2TensorsKernel : public IClKernel
+{
+public:
+    ClWidthConcatenate2TensorsKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWidthConcatenate2TensorsKernel);
+    /** Initialise the kernel's sources and destination
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src1            First source tensor info. Data types supported: All.
+     * @param[in]  src2            Second source tensor info. Data types supported: same as @p src1
+     * @param[out] dst             Destination tensor info. Data types supported: Same as @p src1.
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClWidthConcatenate2TensorsKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+
+private:
+    int32_t _depth{0};
+    int32_t _input1_width{0};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_WIDTH_CONCATENATE_2TENSORS_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp
new file mode 100644
index 0000000000..c4f84e3e45
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2018-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/utils/helpers/tensor_info.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src1,
+                          const ITensorInfo *src2,
+                          const ITensorInfo *src3,
+                          const ITensorInfo *src4,
+                          const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, src3, src4, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1);
+    ARM_COMPUTE_RETURN_ERROR_ON(src1->data_type() == DataType::UNKNOWN);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2, src3, src4, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) + src3->dimension(0) + src4->dimension(0) >
+                                dst->dimension(0));
+
+    for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(i) != dst->dimension(i));
+        ARM_COMPUTE_RETURN_ERROR_ON(src2->dimension(i) != dst->dimension(i));
+        ARM_COMPUTE_RETURN_ERROR_ON(src3->dimension(i) != dst->dimension(i));
+        ARM_COMPUTE_RETURN_ERROR_ON(src4->dimension(i) != dst->dimension(i));
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON(src1->num_dimensions() > 4);
+
+    return Status{};
+}
+} // namespace
+
+ClWidthConcatenate4TensorsKernel::ClWidthConcatenate4TensorsKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+Status ClWidthConcatenate4TensorsKernel::validate(const ITensorInfo *src1,
+                                                  const ITensorInfo *src2,
+                                                  const ITensorInfo *src3,
+                                                  const ITensorInfo *src4,
+                                                  const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, src3, src4, dst));
+    return Status{};
+}
+
+void ClWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile_context,
+                                                 ITensorInfo            *src1,
+                                                 ITensorInfo            *src2,
+                                                 ITensorInfo            *src3,
+                                                 ITensorInfo            *src4,
+                                                 ITensorInfo            *dst)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, src3, src4, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, src3, src4, dst));
+
+    auto               padding_info = get_padding_info({src1, src2, src3, src4, dst});
+    const unsigned int min_dimension =
+        std::min(std::min(src1->dimension(0), src2->dimension(0)), std::min(src3->dimension(0), src4->dimension(0)));
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension);
+    const unsigned int vec_size_leftover                 = dst->dimension(0) % num_elems_processed_per_iteration;
+
+    // Add build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src1->data_type()));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
+    build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src1->element_size()));
+    build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) %
+                                                                           num_elems_processed_per_iteration));
+    build_opts.add_option("-DINPUT2_ROTATE_N=" +
+                          support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) - vec_size_leftover) %
+                                                    num_elems_processed_per_iteration));
+    build_opts.add_option("-DINPUT3_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) +
+                                                                            src3->dimension(0) - vec_size_leftover) %
+                                                                           num_elems_processed_per_iteration));
+
+    _depth        = src1->dimension(2);
+    _input1_width = src1->dimension(0);
+    _input2_width = src2->dimension(0);
+    _input3_width = src3->dimension(0);
+
+    // If soources have different quantization info set quantization parameters needed for the re-quantization process
+    const bool have_different_qinfo =
+        helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2, src3, src4);
+    if (is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo)
+    {
+        const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform();
+        const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform();
+        const UniformQuantizationInfo iq3_info = src3->quantization_info().uniform();
+        const UniformQuantizationInfo iq4_info = src4->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info  = dst->quantization_info().uniform();
+
+        build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq1_info.offset));
+        build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale));
+        build_opts.add_option("-DOFFSET_IN2=" + float_to_string_with_full_precision(iq2_info.offset));
+        build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale));
+        build_opts.add_option("-DOFFSET_IN3=" + float_to_string_with_full_precision(iq3_info.offset));
+        build_opts.add_option("-DSCALE_IN3=" + float_to_string_with_full_precision(iq3_info.scale));
+        build_opts.add_option("-DOFFSET_IN4=" + float_to_string_with_full_precision(iq4_info.offset));
+        build_opts.add_option("-DSCALE_IN4=" + float_to_string_with_full_precision(iq4_info.scale));
+        build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
+        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
+    }
+    std::string kernel_name = "concatenate_width_x4";
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration));
+    ICLKernel::configure_internal(win.collapse(win, Window::DimZ));
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "concatenate_width_x4_";
+    _config_id += lower_string(string_from_data_type(src1->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src1->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src1->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src2->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src2->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src3->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src3->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src4->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src4->dimension(1));
+}
+
+void ClWidthConcatenate4TensorsKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC));
+    const auto src1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1));
+    const auto src2 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 2));
+    const auto src3 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 3));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    Window slice = window.first_slice_window_4D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_4D_tensor_argument(idx, src0, slice);
+        add_4D_tensor_argument(idx, src1, slice);
+        add_4D_tensor_argument(idx, src2, slice);
+        add_4D_tensor_argument(idx, src3, slice);
+        add_4D_tensor_argument(idx, dst, slice);
+        _kernel.setArg<cl_int>(idx++, _depth);
+        _kernel.setArg<cl_int>(idx++, _input1_width);
+        _kernel.setArg<cl_int>(idx++, _input2_width);
+        _kernel.setArg<cl_int>(idx++, _input3_width);
+        enqueue(queue, *this, window, lws_hint());
+    } while (window.slide_window_slice_4D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h
new file mode 100644
index 0000000000..f589b8ac1a
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_CL_WIDTH_CONCATENATE_4TENSORS_KERNEL_H
+#define ARM_COMPUTE_CL_WIDTH_CONCATENATE_4TENSORS_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the width concatenate kernel of 4 tensors.
+ *  All source tensors will be concatenated into the destination tensor.
+ */
+class ClWidthConcatenate4TensorsKernel : public IClKernel
+{
+public:
+    ClWidthConcatenate4TensorsKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWidthConcatenate4TensorsKernel);
+    /** Initialise the kernel's sources and destination
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src1            First source tensor info. Data types supported: All.
+     * @param[in]  src2            Second source tensor info. Data types supported: same as @p src1
+     * @param[in]  src3            Third source tensor info. Data types supported: same as @p src1
+     * @param[in]  src4            Fourth source tensor info. Data types supported: same as @p src1
+     * @param[out] dst             Destination tensor info. Data types supported: same as @p src1.
+     */
+    void configure(const CLCompileContext &compile_context,
+                   ITensorInfo            *src1,
+                   ITensorInfo            *src2,
+                   ITensorInfo            *src3,
+                   ITensorInfo            *src4,
+                   ITensorInfo            *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClWidthConcatenate4TensorsKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src1,
+                           const ITensorInfo *src2,
+                           const ITensorInfo *src3,
+                           const ITensorInfo *src4,
+                           const ITensorInfo *dst);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+
+private:
+    int32_t _depth{0};
+    int32_t _input1_width{0};
+    int32_t _input2_width{0};
+    int32_t _input3_width{0};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_WIDTH_CONCATENATE_4TENSORS_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp b/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp
new file mode 100644
index 0000000000..989de4a7b7
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2018-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClWidthConcatenateKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) + width_offset > dst->dimension(0));
+
+    for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i));
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() > 4);
+
+    return Status{};
+}
+} // namespace
+
+ClWidthConcatenateKernel::ClWidthConcatenateKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+Status ClWidthConcatenateKernel::validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, width_offset, dst));
+    return Status{};
+}
+
+void ClWidthConcatenateKernel::configure(const CLCompileContext &compile_context,
+                                         ITensorInfo            *src,
+                                         unsigned int            width_offset,
+                                         ITensorInfo            *dst)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, width_offset, dst));
+
+    auto padding_info = get_padding_info({src, dst});
+
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, src->dimension(0));
+
+    // Add build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DWIDTH_OFFSET=" + support::cpp11::to_string(width_offset));
+
+    if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
+    {
+        const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
+        const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
+
+        build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iqinfo.offset));
+        build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oqinfo.offset));
+        build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iqinfo.scale));
+        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oqinfo.scale));
+    }
+    _depth                  = src->dimension(2);
+    std::string kernel_name = "concatenate_width";
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration));
+    ICLKernel::configure_internal(win.collapse(win, Window::DimZ));
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+void ClWidthConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, src, window);
+    add_4D_tensor_argument(idx, dst, window);
+    _kernel.setArg<cl_uint>(idx++, _depth);
+    enqueue(queue, *this, window, lws_hint());
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClWidthConcatenateKernel.h b/src/gpu/cl/kernels/ClWidthConcatenateKernel.h
new file mode 100644
index 0000000000..c10d6a4dc6
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWidthConcatenateKernel.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_WIDTH_CONCATENATE_KERNEL_H
+#define ARM_COMPUTE_CL_WIDTH_CONCATENATE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the width concatenate kernel.
+ *  The source tensor will be concatenated into the destination tensor.
+ */
+class ClWidthConcatenateKernel : public IClKernel
+{
+public:
+    ClWidthConcatenateKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWidthConcatenateKernel);
+    /** Initialise the kernel's source and destination
+     *
+     * @param[in]     compile_context The compile context to be used.
+     * @param[in]     src             Source tensor info. Data types supported: All.
+     * @param[in]     width_offset    The offset on the X axis.
+     * @param[in,out] dst             Destination tensor info. Data types supported: same as @p src.
+     *
+     */
+    void
+    configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClWidthConcatenateKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+
+private:
+    int32_t _depth{0};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_WIDTH_CONCATENATE_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp
new file mode 100644
index 0000000000..58c01d4da5
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp
@@ -0,0 +1,188 @@
+/*
+ * Copyright (c) 2018-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+
+    const Size2D kernel_size      = winograd_info.kernel_size;
+    const Size2D output_tile_size = winograd_info.output_tile_size;
+
+    const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        !cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()),
+        "Winograd filter transform not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != kernel_size.width ||
+                                input->dimension(idx_h) != kernel_size.height);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+
+    // Checks performed when output is configured
+    if (output->total_size() != 0)
+    {
+        const TensorInfo tensor_info_output =
+            input->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input, winograd_info));
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_UNUSED(output);
+
+    const unsigned int num_elems_processed_per_iteration_x =
+        input->data_layout() == DataLayout::NCHW ? input->dimension(0) : 1;
+    const unsigned int num_elems_processed_per_iteration_y = input->dimension(1);
+    const unsigned int num_elems_read_per_iteration_z =
+        input->data_layout() == DataLayout::NCHW ? 1 : input->dimension(2);
+
+    Window win =
+        calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y,
+                                           num_elems_read_per_iteration_z));
+    Window win_collapsed = win.collapse(win, Window::DimZ);
+    return std::make_pair(Status{}, win_collapsed);
+}
+} // namespace
+
+ClWinogradFilterTransformKernel::ClWinogradFilterTransformKernel()
+{
+    _type = CLKernelType::WINOGRAD;
+}
+
+void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_context,
+                                                ITensorInfo            *src,
+                                                ITensorInfo            *dst,
+                                                const WinogradInfo     &winograd_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*dst,
+                       src->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*src, winograd_info)));
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, winograd_info));
+    auto padding_info = get_padding_info({src, dst});
+
+    // Set build options
+    CLBuildOptions build_opts;
+
+    // For NHWC layouts pass tensor dimesions at runtime
+    if (src->data_layout() == DataLayout::NHWC)
+    {
+        _src_dim_z = src->dimension(2);
+    }
+    else
+    {
+        build_opts.add_option("-DSRC_DIM_Z=" + support::cpp11::to_string(src->dimension(2)));
+    }
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+    build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL");
+    build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_FILTER_TRANSFORM_VERTICAL");
+    const Size2D kernel_size      = winograd_info.kernel_size;
+    const Size2D output_tile_size = winograd_info.output_tile_size;
+
+    // Create kernel
+    std::string kernel_name = "winograd_filter_transform_" + output_tile_size.to_string() + "_" +
+                              kernel_size.to_string() + "_" + lower_string(string_from_data_layout(src->data_layout()));
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(src, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    IClKernel::configure_internal(win_config.second);
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClWinogradFilterTransformKernel::validate(const ITensorInfo  *src,
+                                                 const ITensorInfo  *dst,
+                                                 const WinogradInfo &winograd_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, winograd_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first);
+
+    return Status{};
+}
+
+void ClWinogradFilterTransformKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IClKernel::window(), window);
+
+    auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    // Setup output window
+    Window window_out;
+    window_out.use_tensor_dimensions(dst->info()->tensor_shape(), 0);
+
+    unsigned int idx = 0;
+    add_4D_tensor_argument(idx, src, window);
+    add_3D_tensor_argument(idx, dst, window_out);
+    if (src->info()->data_layout() == DataLayout::NHWC)
+    {
+        _kernel.setArg<cl_uint>(idx++, _src_dim_z);
+    }
+    enqueue(queue, *this, window, lws_hint());
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h
new file mode 100644
index 0000000000..6e439f0c99
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2018-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_WINOGRAD_FILTER_TRANSFORM_KERNEL_H
+#define ARM_COMPUTE_CL_WINOGRAD_FILTER_TRANSFORM_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the Winograd filter transform kernel. */
+class ClWinogradFilterTransformKernel : public IClKernel
+{
+public:
+    ClWinogradFilterTransformKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWinogradFilterTransformKernel);
+    /** Set the input and output tensor.
+     *
+     * @note Winograd filter transform supports the following configurations for NCWH data layout
+     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
+     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+     *
+     * @note Winograd filter transform supports the following configurations for NHWC data layout
+     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+     *
+     *       Strides: only unit strides
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout) or [IFM, kernel_x, kernel_y, OFM] (NHWC data layout). Data types supported: F16/F32.
+     * @param[out] dst             The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape. Data types supported: Same as @p input
+     * @param[in]  winograd_info   Contains Winograd's information described in @ref WinogradInfo
+     */
+    void configure(const ClCompileContext &compile_context,
+                   ITensorInfo            *src,
+                   ITensorInfo            *dst,
+                   const WinogradInfo     &winograd_info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClWinogradFilterTransformKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    int32_t _src_dim_z{0};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_WINOGRAD_FILTER_TRANSFORM_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp
new file mode 100644
index 0000000000..54c48986fc
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp
@@ -0,0 +1,320 @@
+/*
+ * Copyright (c) 2018-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClWinogradInputTransformKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+
+    const PadStrideInfo conv_info        = winograd_info.convolution_info;
+    const Size2D        output_tile_size = winograd_info.output_tile_size;
+    const Size2D        kernel_size      = winograd_info.kernel_size;
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1,
+                                    "Winograd input transform only supports unit strides");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        !cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()),
+        "Winograd input transform not supported");
+
+    ARM_COMPUTE_UNUSED(conv_info);
+    ARM_COMPUTE_UNUSED(output_tile_size);
+    ARM_COMPUTE_UNUSED(kernel_size);
+
+    // Validate configured output
+    if (output->total_size() != 0)
+    {
+        const TensorShape output_shape =
+            misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info)
+{
+    ARM_COMPUTE_UNUSED(output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    bool window_changed                    = false;
+    int  num_elems_processed_per_iteration = 1;
+
+    if (input->data_layout() == DataLayout::NHWC)
+    {
+        // In the case of FP16 computation, we can perform more
+        // output feature maps in a single work-item.
+        // From experiments, num_elems_processed_per_iteration = 2 looks good for fp16 to
+        // improve the performance. However, in order to make the implementation simpler,
+        // we set num_elems_processed_per_iteration = 2 only when the OFMs are multiple of 2.
+        // Note: At the moment, only Winograd Input Transform 3x3 can support N0 != 1
+        const DataType dt   = input->data_type();
+        const size_t   dim0 = input->dimension(0);
+        const size_t   k_sz = winograd_info.kernel_size.area();
+        const bool     cond = dt == DataType::F16 && ((dim0 % 2) == 0);
+        if (cond)
+        {
+            if (k_sz == 3 || k_sz == 9)
+            {
+                num_elems_processed_per_iteration = 2;
+            }
+        }
+    }
+    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+
+    if (input->data_layout() == DataLayout::NCHW)
+    {
+        const PadStrideInfo conv_info        = winograd_info.convolution_info;
+        const Size2D        output_tile_size = winograd_info.output_tile_size;
+        const Size2D        kernel_size      = winograd_info.kernel_size;
+
+        unsigned int num_elems_read_per_iteration_x = output_tile_size.width + kernel_size.width - 1;
+        unsigned int num_elems_read_per_iteration_y = output_tile_size.height + kernel_size.height - 1;
+
+        AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(),
+                                           num_elems_read_per_iteration_x, num_elems_read_per_iteration_y);
+        window_changed = update_window_and_padding(win, input_access);
+    }
+
+    Status err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+ClWinogradInputTransformKernel::ClWinogradInputTransformKernel()
+{
+    _type = CLKernelType::WINOGRAD;
+}
+
+BorderSize ClWinogradInputTransformKernel::border_size() const
+{
+    return _border_size;
+}
+
+void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_context,
+                                               ITensorInfo            *src,
+                                               ITensorInfo            *dst,
+                                               const WinogradInfo     &winograd_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, winograd_info));
+
+    auto padding_info = get_padding_info({src, dst});
+
+    const PadStrideInfo conv_info        = winograd_info.convolution_info;
+    const Size2D        output_tile_size = winograd_info.output_tile_size;
+    const Size2D        kernel_size      = winograd_info.kernel_size;
+
+    _data_layout = src->data_layout();
+
+    const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+
+    // Compute the number of output tiles along the x and y direction of size "output_tile_size"
+    const Size2D num_tiles = compute_winograd_convolution_tiles(Size2D(src->dimension(idx_w), src->dimension(idx_h)),
+                                                                kernel_size, output_tile_size, conv_info);
+
+    _num_tiles_x = num_tiles.width;
+    _num_tiles_y = num_tiles.height;
+
+    const TensorShape output_shape =
+        misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*dst, src->clone()->set_tensor_shape(output_shape));
+
+    ARM_COMPUTE_ERROR_ON(_num_tiles_x * _num_tiles_y != static_cast<int>(dst->dimension(1)));
+    const size_t total_batches = src->tensor_shape().total_size_upper(3);
+
+    // Create window and update padding
+    auto win_config = validate_and_configure_window(src, dst, winograd_info);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    IClKernel::configure_internal(win_config.second, cl::NDRange(1, 1, 8));
+
+    _src_width  = src->dimension(idx_w);
+    _src_height = src->dimension(idx_h);
+
+    CLBuildOptions build_opts;
+    if (_data_layout == DataLayout::NHWC)
+    {
+        build_opts.add_option("-DNHWC");
+        build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step()));
+        build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
+        build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
+        build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
+        build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height));
+        build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+        build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL");
+        build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_INPUT_TRANSFORM_VERTICAL");
+        build_opts.add_option_if(total_batches > 1, "-DIS_BATCHED");
+    }
+    else
+    {
+        build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(_num_tiles_x));
+        build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left()));
+        build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top()));
+        build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
+        build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height));
+        build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+        build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL");
+        build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_INPUT_TRANSFORM_VERTICAL");
+        build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(2)));
+    }
+
+    // Create kernel
+    std::string kernel_name =
+        "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string();
+
+    // Get the maximum dimension from the tile size
+    const unsigned int tile_max_dim = std::max(output_tile_size.width, output_tile_size.height);
+
+    // Check optimized kernel if output_dims == 2x2
+    if ((tile_max_dim == 2) && (_data_layout == DataLayout::NCHW))
+    {
+        _step_z = (src->dimension(2) % 2) != 0 ? 1 : 2;
+    }
+
+    // Append stepz and data layout
+    kernel_name += "_stepz";
+    kernel_name += support::cpp11::to_string(_step_z);
+    kernel_name += "_" + lower_string(string_from_data_layout(_data_layout));
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    _border_size = BorderSize(src->padding());
+
+    ARM_COMPUTE_ERROR_ON((src->data_layout() == DataLayout::NHWC) && has_padding_changed(padding_info));
+
+    _config_id = kernel_name;
+    _config_id += support::cpp11::to_string(src->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(conv_info.pad_left());
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(conv_info.pad_top());
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_layout(_data_layout));
+}
+
+Status ClWinogradInputTransformKernel::validate(const ITensorInfo  *src,
+                                                const ITensorInfo  *dst,
+                                                const WinogradInfo &winograd_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, winograd_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window(src->clone().get(), dst->clone().get(), winograd_info).first);
+    return Status{};
+}
+
+void ClWinogradInputTransformKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
+
+    auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    const size_t idx_w         = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const size_t idx_h         = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    const size_t idx_c         = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+    const size_t total_batches = window.shape().total_size_upper(3);
+
+    // Collapse window
+    Window window_collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ);
+
+    if (_data_layout == DataLayout::NHWC)
+    {
+        Window slice = window_collapsed.first_slice_window_3D();
+        slice.set(1, Window::Dimension(0, _num_tiles_x * _num_tiles_y, 1));
+        slice.set(2, Window::Dimension(0, total_batches, 1));
+
+        unsigned int idx = 0;
+        add_4D_tensor_argument(idx, src, slice);
+        add_4D_tensor_argument(idx, dst, slice);
+        _kernel.setArg<cl_uint>(idx++, _src_width);
+        _kernel.setArg<cl_uint>(idx++, _src_height);
+        _kernel.setArg<cl_uint>(idx++, _num_tiles_x);
+        _kernel.setArg<cl_uint>(idx++, _num_tiles_y);
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    else
+    {
+        Window slice = window_collapsed.first_slice_window_3D();
+        slice.set(idx_w, Window::Dimension(0, _num_tiles_x, 1));
+        slice.set(idx_h, Window::Dimension(0, _num_tiles_y, 1));
+
+        ARM_COMPUTE_ERROR_ON(((slice[idx_c].end() - slice[idx_c].start()) % _step_z) != 0);
+        slice.set(idx_c, Window::Dimension(slice[idx_c].start(), slice[idx_c].end(), _step_z));
+
+        unsigned int idx = 2 * num_arguments_per_3D_tensor();
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src->info()->strides_in_bytes()[3]));
+        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[3]));
+
+        do
+        {
+            unsigned int idx = 0;
+            add_3D_tensor_argument(idx, src, slice);
+            add_3D_tensor_argument(idx, dst, slice);
+
+            enqueue(queue, *this, slice, lws_hint());
+        } while (window_collapsed.slide_window_slice_3D(slice));
+    }
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h
new file mode 100644
index 0000000000..cebebea1d3
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2018-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_WINOGRAD_INPUT_TRANSFORM_KERNEL_H
+#define ARM_COMPUTE_CL_WINOGRAD_INPUT_TRANSFORM_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to perform Winograd input transform.*/
+class ClWinogradInputTransformKernel : public IClKernel
+{
+public:
+    ClWinogradInputTransformKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWinogradInputTransformKernel);
+    /** Set the input and output of the kernel.
+     *
+     * @note Winograd input transform supports the following configurations for NCWH data layout
+     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
+     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+     *
+     * @note Winograd input transform supports the following configurations for NHWC data layout
+     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+     *
+     *       Strides: only unit strides
+     *
+     * @param[in] compile_context The compile context to be used.
+     * @param[in] src             The input tensor info to transform. Data types supported: F16/F32
+     * @param[in] dst             The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p input
+     * @param[in] winograd_info   Contains Winograd's information described in @ref WinogradInfo.
+     */
+    void configure(const ClCompileContext &compile_context,
+                   ITensorInfo            *src,
+                   ITensorInfo            *dst,
+                   const WinogradInfo     &winograd_info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClWinogradInputTransformKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info);
+
+    // Inherited methods overridden:
+    void       run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+private:
+    using WinogradKey = std::pair<std::pair<int, int>, std::pair<int, int>>;
+
+    BorderSize   _border_size{0};
+    DataLayout   _data_layout{DataLayout::UNKNOWN};
+    int          _num_tiles_x{0};
+    int          _num_tiles_y{0};
+    unsigned int _step_z{1};
+    int32_t      _src_width{0};
+    int32_t      _src_height{0};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_WINOGRAD_INPUT_TRANSFORM_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
new file mode 100644
index 0000000000..89c80c55ef
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2018-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+#include <cmath>
+
+using namespace arm_compute::misc::shape_calculator;
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo         *input,
+                          const ITensorInfo         *bias,
+                          const ITensorInfo         *output,
+                          const WinogradInfo        &winograd_info,
+                          const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_UNUSED(act_info);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != winograd_info.output_data_layout);
+
+    const PadStrideInfo conv_info        = winograd_info.convolution_info;
+    const Size2D        output_tile_size = winograd_info.output_tile_size;
+    const Size2D        kernel_size      = winograd_info.kernel_size;
+    const Size2D        input_dimensions = winograd_info.input_dimensions;
+    const unsigned int  num_channels = (winograd_info.kernel_size.width + winograd_info.output_tile_size.width - 1) *
+                                      (winograd_info.kernel_size.height + winograd_info.output_tile_size.height - 1);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        !cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, winograd_info.output_data_layout),
+        "Winograd output transform not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != num_channels, "Wrong number of channels");
+
+    // Compute number of elements to process in the X and Y direction
+    // Compute the number of output tiles along the x and y direction of size "output_tile_size"
+    const Size2D num_tiles =
+        compute_winograd_convolution_tiles(input_dimensions, kernel_size, output_tile_size, conv_info);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != static_cast<unsigned int>((num_tiles.area())));
+
+    if (bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+        ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
+    }
+
+    // Checks performed when output is configured
+    if (output->total_size() != 0)
+    {
+        const TensorInfo tensor_info_output =
+            input->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input, winograd_info));
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo  *input,
+                                                        ITensorInfo  *bias,
+                                                        ITensorInfo  *output,
+                                                        const Size2D &output_tile_size)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_UNUSED(bias);
+
+    unsigned int num_elems_processed_per_iteration = 1;
+
+    if (input->data_layout() == DataLayout::NHWC)
+    {
+        // In the case of FP16 computation, we can perform more
+        // output feature maps in a single work-item.
+        // From experiments, num_elems_processed_per_iteration = 2 looks good for fp16 to
+        // improve the performance. However, in order to make the implementation simpler,
+        // we set num_elems_processed_per_iteration = 2 only when the OFMs are multiple of 2.
+        const DataType dt   = input->data_type();
+        const size_t   dim0 = input->dimension(0);
+        const bool     cond = dt == DataType::F16 && ((dim0 % 2) == 0);
+        if (cond)
+        {
+            num_elems_processed_per_iteration = 2;
+        }
+    }
+
+    Window win            = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
+    bool   window_changed = false;
+
+    if (output->data_layout() == DataLayout::NCHW)
+    {
+        const int output_static_window_end_x = ceil_to_multiple(output->dimension(0), output_tile_size.width);
+        const int output_static_window_end_y = ceil_to_multiple(output->dimension(1), output_tile_size.height);
+
+        AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration,
+                                           num_elems_processed_per_iteration);
+        AccessWindowStatic    output_access(output, 0, 0, output_static_window_end_x, output_static_window_end_y);
+        window_changed = update_window_and_padding(win, input_access, output_access);
+    }
+
+    Status err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+ClWinogradOutputTransformKernel::ClWinogradOutputTransformKernel()
+{
+    _type = CLKernelType::WINOGRAD;
+}
+
+void ClWinogradOutputTransformKernel::configure(const ClCompileContext    &compile_context,
+                                                ITensorInfo               *src,
+                                                ITensorInfo               *bias,
+                                                ITensorInfo               *dst,
+                                                const WinogradInfo        &winograd_info,
+                                                const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*dst,
+                       src->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*src, winograd_info)));
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, winograd_info, act_info));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(src, bias, dst, winograd_info.output_tile_size);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    IClKernel::configure_internal(win_config.second);
+
+    auto padding_info = get_padding_info({src, bias, dst});
+
+    _is_nhwc = winograd_info.output_data_layout == DataLayout::NHWC;
+
+    // Compute num_tiles_x
+    const Size2D        input_dimensions = winograd_info.input_dimensions;
+    const Size2D        kernel_size      = winograd_info.kernel_size;
+    const Size2D        output_tile_size = winograd_info.output_tile_size;
+    const PadStrideInfo conv_info        = winograd_info.convolution_info;
+    const int idx_width = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::WIDTH);
+    const int idx_height =
+        get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::HEIGHT);
+
+    // Compute the number of output tiles along the x and y direction of size "output_tile_size"
+    const Size2D num_tiles =
+        compute_winograd_convolution_tiles(input_dimensions, kernel_size, output_tile_size, conv_info);
+    const size_t total_batches = dst->tensor_shape().total_size_upper(3);
+
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
+    build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
+    build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
+
+    if ((output_tile_size.x() == 2) || (output_tile_size.x() == 1 && output_tile_size.y() == 2))
+    {
+        build_opts.add_option("-DVEC_SIZE=2");
+    }
+    else if ((output_tile_size.x() == 4) || (output_tile_size.x() == 1 && output_tile_size.y() == 4))
+    {
+        build_opts.add_option("-DVEC_SIZE=4");
+    }
+
+    _num_tiles_x = num_tiles.width;
+
+    // Conditions of -cl-fast-relaxed-math causing accuracy issues can be traced from COMPMID-5324
+    const GPUTarget gpu_target    = get_target();
+    const auto      act_function  = act_info.activation();
+    const auto      src_data_type = src->data_type();
+
+    if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+        (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU ||
+         act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) &&
+        (src_data_type == DataType::F32 || src_data_type == DataType::F16))
+    {
+        // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations
+        // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations
+        build_opts.add_option("-cl-unsafe-math-optimizations");
+    }
+    else
+    {
+        build_opts.add_option("-cl-fast-relaxed-math");
+    }
+
+    if (_is_nhwc)
+    {
+        build_opts.add_option_if(bias != nullptr, std::string("-DHAS_BIAS"));
+        build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step()));
+        build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
+        build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height));
+        build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src_data_type));
+        build_opts.add_option_if(total_batches > 1, "-DIS_BATCHED");
+        build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL");
+        build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL");
+        build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(_num_tiles_x));
+    }
+    else
+    {
+        build_opts.add_option_if(bias != nullptr, std::string("-DHAS_BIAS"));
+        build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step()));
+        build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(num_tiles.width));
+        build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width));
+        build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height));
+        build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src_data_type));
+        build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(1)));
+        build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(idx_width)));
+        build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(idx_height)));
+        build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(2)));
+        build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL");
+        build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL");
+    }
+
+    // Storing tensor dimensions to be sent later as kernel arguments
+    _src_height = src->dimension(1);
+    _dst_width  = dst->dimension(idx_width);
+    _dst_height = dst->dimension(idx_height);
+
+    // Create kernel
+    std::string kernel_name = "winograd_output_transform_" + output_tile_size.to_string() + "_" +
+                              kernel_size.to_string() + "_" +
+                              lower_string(string_from_data_layout(winograd_info.output_data_layout));
+
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_type(src_data_type));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(1));
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_layout(winograd_info.output_data_layout));
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info) && _is_nhwc);
+}
+
+Status ClWinogradOutputTransformKernel::validate(const ITensorInfo         *src,
+                                                 const ITensorInfo         *bias,
+                                                 const ITensorInfo         *dst,
+                                                 const WinogradInfo        &winograd_info,
+                                                 const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_arguments(src, (bias != nullptr ? bias->clone().get() : nullptr), dst, winograd_info, act_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(),
+                                                              (bias != nullptr ? bias->clone().get() : nullptr),
+                                                              dst->clone().get(), winograd_info.output_tile_size)
+                                    .first);
+    return Status{};
+}
+
+void ClWinogradOutputTransformKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IClKernel::window(), window);
+
+    auto src  = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    auto dst  = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    // Collapse window
+    Window window_collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ);
+
+    // Get initial windows
+    Window slice = window_collapsed.first_slice_window_4D();
+    slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    // Setup output slice
+    Window slice_out(slice);
+    slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
+    slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+    if (bias != nullptr)
+    {
+        unsigned int idx1 = 2 * num_arguments_per_4D_tensor();
+        Window       slice_biases;
+        slice_biases.use_tensor_dimensions(bias->info()->tensor_shape());
+        add_1D_tensor_argument(idx1, bias, slice_biases);
+    }
+
+    if (_is_nhwc)
+    {
+        unsigned int idx2 = 2 * num_arguments_per_4D_tensor() + ((bias != nullptr) ? num_arguments_per_1D_tensor() : 0);
+        _kernel.setArg(idx2++, static_cast<int>(dst->info()->total_size() - dst->info()->strides_in_bytes().y()));
+        _kernel.setArg<cl_int>(idx2++, _src_height);
+        _kernel.setArg<cl_int>(idx2++, _dst_width);
+        _kernel.setArg<cl_int>(idx2++, _dst_height);
+    }
+
+    do
+    {
+        unsigned int idx = 0;
+        add_4D_tensor_argument(idx, src, slice);
+        add_4D_tensor_argument(idx, dst, slice_out);
+        enqueue(queue, *this, slice, lws_hint());
+    } while (window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h
new file mode 100644
index 0000000000..65bb963061
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2018-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_WINOGRAD_OUTPUT_TRANSFORM_KERNEL_H
+#define ARM_COMPUTE_CL_WINOGRAD_OUTPUT_TRANSFORM_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the Winograd output transform kernel. */
+class ClWinogradOutputTransformKernel : public IClKernel
+{
+public:
+    ClWinogradOutputTransformKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWinogradOutputTransformKernel);
+    /** Set the input and output tensor.
+     *
+     * @note Winograd output transform supports the following configurations for NCWH data layout
+     *       F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3),
+     *                                   F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+     *
+     * @note Winograd output transform supports the following configurations for NHWC data layout
+     *       F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3),
+     *                                   F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5)
+     *
+     *       Strides: only unit strides
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info with shape [C, N, K, batches]. Data types supported: F16/F32.
+     * @param[in]  bias            Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p src
+     * @param[out] dst             The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_output_transform_shape. Data types supported: Same as @p src
+     * @param[in]  winograd_info   Contains Winograd's information described in @ref WinogradInfo
+     * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const ClCompileContext    &compile_context,
+                   ITensorInfo               *src,
+                   ITensorInfo               *bias,
+                   ITensorInfo               *dst,
+                   const WinogradInfo        &winograd_info,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClWinogradOutputTransformKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo         *src,
+                           const ITensorInfo         *bias,
+                           const ITensorInfo         *dst,
+                           const WinogradInfo        &winograd_info,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    using WinogradKey = std::pair<std::pair<int, int>, std::pair<int, int>>;
+
+    bool    _is_nhwc{false};
+    int32_t _src_height{0};
+    int32_t _dst_width{0};
+    int32_t _dst_height{0};
+    int32_t _num_tiles_x{0};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_WINOGRAD_OUTPUT_TRANSFORM_KERNEL_H */
diff --git a/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp b/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp
new file mode 100644
index 0000000000..b5ebac3b49
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp
@@ -0,0 +1,226 @@
+/*
+ * Copyright (c) 2019-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include <limits>
+#include <utility>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m,
+                                                                       unsigned int n,
+                                                                       unsigned int m0,
+                                                                       unsigned int n0,
+                                                                       unsigned int k0,
+                                                                       unsigned int v0,
+                                                                       unsigned int h0,
+                                                                       bool         lhs_interleave,
+                                                                       bool         rhs_interleave,
+                                                                       bool         lhs_transpose,
+                                                                       bool         rhs_transpose,
+                                                                       bool         export_to_cl_image)
+{
+    ARM_COMPUTE_ERROR_ON(m0 == 0 || n0 == 0);
+    ARM_COMPUTE_ERROR_ON(v0 == 0);
+    v0 = std::max(std::min(static_cast<int>(m / m0), static_cast<int>(v0)), static_cast<int>(1));
+
+    if (h0 == 0)
+    {
+        // When h0 is 0, we should take the maximum H0 possible
+        h0 = std::max(n / n0, 1U);
+    }
+    else
+    {
+        h0 = std::max(std::min(static_cast<int>(n / n0), static_cast<int>(h0)), static_cast<int>(1));
+    }
+
+    const GEMMLHSMatrixInfo lhs_info(m0, k0, v0, lhs_transpose, lhs_interleave);
+    const GEMMRHSMatrixInfo rhs_info(n0, k0, h0, rhs_transpose, rhs_interleave, export_to_cl_image);
+
+    return std::make_pair(lhs_info, rhs_info);
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img,
+                    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf,
+                    unsigned int                                    n,
+                    unsigned int                                    k,
+                    unsigned int                                    b,
+                    DataType                                        data_type)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(info_buf.second.export_to_cl_image == true,
+                             "The fallback GeMM configuration cannot have export_to_cl_image = true");
+
+    const TensorInfo  tensor_rhs_info(TensorShape(n, k, b), 1, data_type);
+    const TensorShape shape = misc::shape_calculator::compute_rhs_reshaped_shape(tensor_rhs_info, info_img.second);
+    const TensorInfo  tensor_reshaped_info(shape, 1, data_type);
+
+    if (bool(validate_image2d_support_on_rhs(tensor_reshaped_info, info_img.second)))
+    {
+        return info_img;
+    }
+    else
+    {
+        return info_buf;
+    }
+}
+
+void update_padding_for_cl_image(ITensorInfo *tensor)
+{
+    constexpr unsigned int num_floats_per_pixel = 4;
+
+    const unsigned int stride_y_in_elements = tensor->strides_in_bytes()[1] / tensor->element_size();
+    const unsigned int pixel_alignment      = get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device());
+
+    ARM_COMPUTE_ERROR_ON_MSG(pixel_alignment == 0, "Cannot retrieve cl_image pitch alignment");
+    if (pixel_alignment == 0)
+    {
+        return;
+    }
+
+    const unsigned int row_pitch_alignment = pixel_alignment * num_floats_per_pixel;
+    const unsigned int round_up_width =
+        ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment;
+    const unsigned int padding = round_up_width - stride_y_in_elements;
+
+    tensor->extend_padding(PaddingSize(0, tensor->padding().right + padding, 0, 0));
+}
+
+Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info)
+{
+    if (rhs_info.export_to_cl_image)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 == 2) || (rhs_info.n0 == 3)) && rhs_info.transpose == false,
+                                        "Export to cl_image only supported with n0 = 4, 8 or 16");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 == 2) || (rhs_info.k0 == 3)) && rhs_info.transpose == true,
+                                        "Export to cl_image only supported with k0 = 4, 8 or 16");
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(&tensor_reshaped_info, DataType::F32, DataType::F16);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            !image2d_from_buffer_supported(CLKernelLibrary::get().get_device()),
+            "The extension cl_khr_image2d_from_buffer is not supported on the target platform");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0,
+                                        "Impossible to retrieve the cl_image pitch alignment");
+
+        // Check the width and height of the output tensor.
+        // Since we cannot create a 3d image from a buffer, the third dimension is collapsed on the second dimension
+        const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
+        const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[0] > max_image_w * 4,
+                                        "Not supported width for cl_image");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            tensor_reshaped_info.tensor_shape()[1] * tensor_reshaped_info.tensor_shape()[2] > max_image_h,
+            "Not supported height for cl_image");
+    }
+
+    return Status{};
+}
+
+bool is_mmul_kernel_preferred(const unsigned int m,
+                              const unsigned int n,
+                              const unsigned int k,
+                              const unsigned int b,
+                              const DataType     data_type,
+                              unsigned int      &best_m0,
+                              unsigned int      &best_n0)
+{
+    ARM_COMPUTE_UNUSED(n, k, b, data_type);
+
+    const unsigned int mmul_k0 = 4;
+    best_m0                    = 4;
+    best_n0                    = 4;
+
+    const unsigned int ceil_to_multiple_m_m0             = ceil_to_multiple(m, best_m0);
+    const unsigned int m_div_m0                          = ceil_to_multiple_m_m0 / best_m0;
+    const unsigned int ceil_to_multiple_m_div_m0_mmul_k0 = ceil_to_multiple(m_div_m0, mmul_k0);
+    const unsigned int gws_y                             = ceil_to_multiple_m_div_m0_mmul_k0 / mmul_k0;
+
+    return ((k % mmul_k0) == 0) && (gws_y > 4);
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    size_t min_acc = std::numeric_limits<size_t>::max();
+    size_t min_idx = 0;
+
+    ARM_COMPUTE_ERROR_ON(configs.size() == 0);
+    const size_t num_rows = configs.size();
+    const size_t num_cols = configs[0].size();
+
+    ARM_COMPUTE_ERROR_ON_MSG(num_cols != 14U, "The entry should have 14 integer values representing: M, N, K, B, M0, "
+                                              "N0. K0, V0, H0, INT_LHS, INT_RHS, TRA_LHS, TRA_RHS, IMG_RHS");
+    ARM_COMPUTE_UNUSED(num_cols);
+
+    // Find nearest GeMM workload
+    // Note: the workload does not depend on the K dimension
+    for (size_t y = 0; y < num_rows; ++y)
+    {
+        size_t mc0 = static_cast<size_t>(configs[y][0]);
+        size_t nc0 = static_cast<size_t>(configs[y][1]);
+        size_t kc0 = static_cast<size_t>(configs[y][2]);
+        size_t bc0 = static_cast<size_t>(configs[y][3]);
+
+        size_t acc = 0;
+        acc += (m - mc0) * (m - mc0);
+        acc += (n - nc0) * (n - nc0);
+        acc += (k - kc0) * (k - kc0);
+        acc += (b - bc0) * (b - bc0);
+        acc = std::sqrt(acc);
+        if (acc < min_acc)
+        {
+            min_acc = acc;
+            min_idx = y;
+        }
+    }
+
+    // Get the configuration from the nearest GeMM shape
+    const int m0     = configs[min_idx][4];
+    const int n0     = configs[min_idx][5];
+    const int k0     = configs[min_idx][6];
+    const int v0     = configs[min_idx][7];
+    const int h0     = configs[min_idx][8];
+    const int i_lhs  = configs[min_idx][9];
+    const int i_rhs  = configs[min_idx][10];
+    const int t_lhs  = configs[min_idx][11];
+    const int t_rhs  = configs[min_idx][12];
+    const int im_rhs = configs[min_idx][13];
+
+    return configure_lhs_rhs_info(m, n, m0, n0, k0, v0, h0, i_lhs, i_rhs, t_lhs, t_rhs, im_rhs);
+}
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/gemm/ClGemmHelpers.h b/src/gpu/cl/kernels/gemm/ClGemmHelpers.h
new file mode 100644
index 0000000000..84776fb207
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/ClGemmHelpers.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2019-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_HELPERS_H
+#define ARM_COMPUTE_CL_GEMM_HELPERS_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+using GeMMConfigsMatrix = std::vector<std::vector<int32_t>>;
+
+/** Configure @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
+ *
+ * @param[in] m                  Number of rows (M) in the LHS matrix not reshaped
+ * @param[in] n                  Number of columns (N) in the RHS matrix not reshaped
+ * @param[in] m0                 Number of rows processed by each thread/work-item
+ * @param[in] n0                 Number of columns processed by each thread/work-item
+ * @param[in] k0                 Number of inner accumulation performed by each thread/work-item
+ * @param[in] v0                 Number of vertical blocks of size (m0xk0) stored on the same output row
+ * @param[in] h0                 Number of horizontal blocks of size (k0xn0) stored on the same output row
+ * @param[in] lhs_interleave     True if the v0 (m0xk0) blocks have to be interleaved in the output row
+ * @param[in] rhs_interleave     True if the h0 (k0xn0) blocks have to be interleaved in the output row
+ * @param[in] lhs_transpose      True if the (m0xk0) block has to be transposed before been stored
+ * @param[in] rhs_transpose      True if the (k0xn0) block has to be transposed before been stored
+ * @param[in] export_to_cl_image (Optional) True if the RHS reshaped matrix has to be exported to cl_image
+ *
+ * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
+ */
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m,
+                                                                       unsigned int n,
+                                                                       unsigned int m0,
+                                                                       unsigned int n0,
+                                                                       unsigned int k0,
+                                                                       unsigned int v0,
+                                                                       unsigned int h0,
+                                                                       bool         lhs_interleave,
+                                                                       bool         rhs_interleave,
+                                                                       bool         lhs_transpose,
+                                                                       bool         rhs_transpose,
+                                                                       bool         export_to_cl_image = false);
+
+/** Select @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
+ *
+ * This function accepts two pairs of GEMMLHSMatrixInfo/GEMMRHSMatrixInfo where only the first is with cl_image2d support,
+ * and selects the valid one validating the GEMMRHSMatrixInfo. If the validation passes, the functions will return
+ * the first GEMMLHSMatrixInfo/GEMMRHSMatrixInfo pair with cl_image2d support.
+ *
+ * @param[in] info_img  GEMMLHSMatrixInfo/GEMMRHSMatrixInfo with cl_image2d support
+ * @param[in] info_buf  GEMMLHSMatrixInfo/GEMMRHSMatrixInfo to fall-back if cl_image2d cannot be used
+ * @param[in] n         Number of columns (N) in the RHS matrix not reshaped
+ * @param[in] k         Number of rows (K) in the RHS matrix not reshaped
+ * @param[in] b         Batch size
+ * @param[in] data_type Data type
+ *
+ * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
+ */
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img,
+                    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf,
+                    unsigned int                                    n,
+                    unsigned int                                    k,
+                    unsigned int                                    b,
+                    DataType                                        data_type);
+
+/** Update padding required to export the OpenCL buffer to OpenCL image2d
+ *
+ * @param[in,out] tensor ITensorInfo of the tensor required to be exported to OpenCL image2d
+ */
+void update_padding_for_cl_image(ITensorInfo *tensor);
+
+/** Utility function to validate the image2d OpenCL object support on the RHS reshaped matrix
+ *
+ * @param[in] tensor_reshaped_info TensorInfo for the RHS reshaped matrix
+ * @param[in] rhs_info             @ref GEMMRHSMatrixInfo
+ *
+ * @return Status reporting if we can use the image2d OpenCL object on the RHS reshaped matrix
+ */
+Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info);
+
+/** Determine if the MMUL kernels should be preferred
+ *
+ * @param[in]      m         Number of rows of the LHS matrix
+ * @param[in]      n         Number of columns of the RHS matrix
+ * @param[in]      k         Number of columns of the LHS matrix, rows of the RHS matrix
+ * @param[in]      b         Batch size
+ * @param[in]      data_type Data type FP32/FP16
+ * @param[in, out] best_m0   Suggested M0 (number of rows of the output block) for the kernel
+ * @param[in, out] best_n0   Suggested N0 (number of columns of the output block) for the kernel
+ *
+ * @return true if MMUL kernel is preferred over kernels w/o MMUL, false otherwise
+ */
+bool is_mmul_kernel_preferred(const unsigned int m,
+                              const unsigned int n,
+                              const unsigned int k,
+                              const unsigned int b,
+                              const DataType     data_type,
+                              unsigned int      &best_m0,
+                              unsigned int      &best_n0);
+
+/** Find the preferred configurations for the LHS and RHS tensor using the GeMMConfigsMatrix provided by the user
+ *
+ * @param[in] configs List of best configurations for a limited number of GeMM shapes
+ * @param[in] m       Number of rows of the LHS matrix
+ * @param[in] n       Number of columns of the RHS matrix
+ * @param[in] k       Number of columns of the LHS matrix, rows of the RHS matrix
+ * @param[in] b       Batch size
+ *
+ * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
+ */
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_HELPERS_H */
diff --git a/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h b/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h
new file mode 100644
index 0000000000..9d08633963
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_ICL_GEMM_KERNEL_CONFIG_H
+#define ARM_COMPUTE_ICL_GEMM_KERNEL_CONFIG_H
+
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/Types.h"
+
+#include "src/core/common/Macros.h"
+
+#include <array>
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** Basic container for the OpenCL GEMM configuration functions */
+template <class T>
+class CLGEMMConfigArray
+{
+public:
+    /** Alias for F32 index */
+    static constexpr size_t DT_F32 = 0;
+    /** Alias for F16 index */
+    static constexpr size_t DT_F16 = 1;
+    /** Alias for Int8 index */
+    static constexpr size_t DT_INT8 = 2;
+
+    /** Constructor
+     *
+     * @param[in] func_f32  Function to call for GEMM F32
+     * @param[in] func_f16  Function to call for GEMM F16
+     * @param[in] func_int8 Function to call for GEMM Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
+     *
+     */
+    CLGEMMConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8}
+    {
+    }
+
+    /** Method to return the GEMM configuration function based on data type
+     *
+     * @param[in] data_type Input data type
+     *
+     * @return the valid function otherwise it returns nullptr if the data type is not valid
+     */
+    T get_function(DataType data_type)
+    {
+        switch (data_type)
+        {
+            case DataType::F32:
+                return _configs.at(DT_F32);
+            case DataType::F16:
+                return _configs.at(DT_F16);
+            case DataType::QASYMM8:
+            case DataType::QASYMM8_SIGNED:
+            case DataType::QSYMM8_PER_CHANNEL:
+                return _configs.at(DT_INT8);
+            default:
+                return nullptr;
+        }
+    }
+
+private:
+    std::array<T, 3> _configs;
+};
+
+/** Basic interface for the GEMM kernel configuration */
+class IClGemmKernelConfig
+{
+public:
+    /** Constructor
+     *
+     * @param[in] arch GPU target
+     */
+    IClGemmKernelConfig(GPUTarget arch) : _target(arch)
+    {
+    }
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClGemmKernelConfig);
+    /** Virtual destructor */
+    virtual ~IClGemmKernelConfig() = default;
+    /** Given M, N, K and B, this method returns the @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo to be used
+     *
+     * @param[in] m         Number of rows LHS matrix
+     * @param[in] n         Number of columns RHS matrix
+     * @param[in] k         Number of columns LHS matrix or number of rows RHS matrix
+     * @param[in] b         Batch size
+     * @param[in] data_type Data type
+     */
+    virtual std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) = 0;
+
+protected:
+    GPUTarget _target;
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_ICL_GEMM_KERNEL_CONFIG_H */
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp
new file mode 100644
index 0000000000..2f37eef31f
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/GPUTarget.h"
+
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+ClGemmDefaultConfigNativeBifrost::ClGemmDefaultConfigNativeBifrost(GPUTarget gpu) : IClGemmKernelConfig(gpu)
+{
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+{
+    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (
+        ClGemmDefaultConfigNativeBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(
+        &ClGemmDefaultConfigNativeBifrost::configure_G71_f32,
+        &ClGemmDefaultConfigNativeBifrost::configure_G71_f32, // We use the F32 heuristic
+        &ClGemmDefaultConfigNativeBifrost::configure_G71_u8);
+
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(
+        &ClGemmDefaultConfigNativeBifrost::configure_G76_f32,
+        &ClGemmDefaultConfigNativeBifrost::configure_G76_f32, // We use the F32 heuristic
+        &ClGemmDefaultConfigNativeBifrost::configure_G76_u8);
+
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(
+        &ClGemmDefaultConfigNativeBifrost::configure_default_f32,
+        &ClGemmDefaultConfigNativeBifrost::configure_default_f32, // We use the F32 heuristic
+        &ClGemmDefaultConfigNativeBifrost::configure_default_u8);
+
+    ConfigurationFunctionExecutorPtr func = nullptr;
+
+    switch (_target)
+    {
+        case GPUTarget::G76:
+            func = configs_G76.get_function(data_type);
+            break;
+        case GPUTarget::G71:
+            func = configs_G71.get_function(data_type);
+            break;
+        default:
+            func = configs_G7x.get_function(data_type);
+            break;
+    }
+
+    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM");
+    return (this->*func)(m, n, k, b);
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeBifrost::configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if (m == 1)
+    {
+        if (n < 2048)
+        {
+            return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false);
+        }
+        else if (n >= 2048 && n < 8192)
+        {
+            return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false);
+        }
+        else
+        {
+            return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 1, false, false, false, false);
+        }
+    }
+    else
+    {
+        return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 1, false, false, false, false);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeBifrost::configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if (dot8_supported(CLKernelLibrary::get().get_device()))
+    {
+        if (m == 1)
+        {
+            if (n < 2048)
+            {
+                return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false);
+            }
+            else if (n >= 2048 && n < 16384)
+            {
+                return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false);
+            }
+            else
+            {
+                return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false);
+            }
+        }
+        else
+        {
+            if (m < 64)
+            {
+                return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false);
+            }
+            else
+            {
+                return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false);
+            }
+        }
+    }
+    else
+    {
+        if (m == 1)
+        {
+            if (n < 8192)
+            {
+                return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false);
+            }
+            else
+            {
+                return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false);
+            }
+        }
+        else
+        {
+            return configure_lhs_rhs_info(m, n, 2, 8, 16, 1, 1, false, false, false, false);
+        }
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if (m == 1)
+    {
+        if (n > 4196)
+        {
+            return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 1, false, false, false, false);
+        }
+        else
+        {
+            if (k < 2048)
+            {
+                return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 1, false, false, false, false);
+            }
+            else if (k >= 2048 && k < 16384)
+            {
+                return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false);
+            }
+            else
+            {
+                return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 1, false, false, false, false);
+            }
+        }
+    }
+    else
+    {
+        return configure_lhs_rhs_info(m, n, 2, 8, 2, 1, 1, false, false, false, false);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if (m == 1)
+    {
+        if (n < 2048)
+        {
+            return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false);
+        }
+        else if (n >= 2048 && n < 16384)
+        {
+            return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false);
+        }
+        else
+        {
+            return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false);
+        }
+    }
+    else
+    {
+        if (m < 64)
+        {
+            return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false);
+        }
+        else
+        {
+            return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false);
+        }
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeBifrost::configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 1, false, false, false, false);
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeBifrost::configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false);
+}
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h
new file mode 100644
index 0000000000..f822daae53
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_BIFROST_H
+#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_BIFROST_H
+
+#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** Bifrost based OpenCL GEMMNative configuration */
+class ClGemmDefaultConfigNativeBifrost final : public IClGemmKernelConfig
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    ClGemmDefaultConfigNativeBifrost(GPUTarget gpu);
+
+    // Inherited overridden method
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_BIFROST_H */
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp
new file mode 100644
index 0000000000..f87fb1b659
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/GPUTarget.h"
+
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+ClGemmDefaultConfigNativeMidgard::ClGemmDefaultConfigNativeMidgard(GPUTarget gpu) : IClGemmKernelConfig(gpu)
+{
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard::configure(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+{
+    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (
+        ClGemmDefaultConfigNativeMidgard::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default(nullptr, nullptr,
+                                                                        &ClGemmDefaultConfigNativeMidgard::default_q8);
+
+    auto func = configs_default.get_function(data_type);
+    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM");
+    return (this->*func)(m, n, k, b);
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    const unsigned int m0 = std::min(m, static_cast<unsigned int>(4));
+    const unsigned int n0 = std::min(n, static_cast<unsigned int>(4));
+
+    return configure_lhs_rhs_info(m, n, m0, n0, 2, 1, 1, false, false, false, false);
+}
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h
new file mode 100644
index 0000000000..fa76c5dba7
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_MIDGARD_H
+#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_MIDGARD_H
+
+#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** Midgard based OpenCL GEMMNative configuration */
+class ClGemmDefaultConfigNativeMidgard final : public IClGemmKernelConfig
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    ClGemmDefaultConfigNativeMidgard(GPUTarget gpu);
+
+    // Inherited overridden method
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_MIDGARD_H */
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp
new file mode 100644
index 0000000000..97a1298b0a
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/GPUTarget.h"
+
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+ClGemmDefaultConfigNativeValhall::ClGemmDefaultConfigNativeValhall(GPUTarget gpu) : IClGemmKernelConfig(gpu)
+{
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+{
+    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (
+        ClGemmDefaultConfigNativeValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default(
+        &ClGemmDefaultConfigNativeValhall::configure_G77_f32, &ClGemmDefaultConfigNativeValhall::configure_G77_f16,
+        &ClGemmDefaultConfigNativeValhall::configure_G77_u8);
+
+    auto func = configs_default.get_function(data_type);
+    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM");
+    return (this->*func)(m, n, k, b);
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if (m == 1)
+    {
+        if (n < 2048)
+        {
+            return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false);
+        }
+        else if (n >= 2048 && n < 8192)
+        {
+            return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false);
+        }
+        else
+        {
+            return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 1, false, false, false, false);
+        }
+    }
+    else
+    {
+        return configure_lhs_rhs_info(m, n, 5, 4, 2, 1, 1, false, false, false, false);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if (m == 1)
+    {
+        if (n < 2048)
+        {
+            return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false);
+        }
+        else if (n >= 2048 && n < 8192)
+        {
+            return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false);
+        }
+        else
+        {
+            return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 1, false, false, false, false);
+        }
+    }
+    else
+    {
+        return configure_lhs_rhs_info(m, n, 4, 8, 2, 1, 1, false, false, false, false);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if (dot8_supported(CLKernelLibrary::get().get_device()))
+    {
+        if (m == 1)
+        {
+            if (n < 2048)
+            {
+                return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false);
+            }
+            else if (n >= 2048 && n < 16384)
+            {
+                return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false);
+            }
+            else
+            {
+                return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false);
+            }
+        }
+        else
+        {
+            if (m < 64)
+            {
+                return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false);
+            }
+            else
+            {
+                return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false);
+            }
+        }
+    }
+    else
+    {
+        if (m == 1)
+        {
+            if (n < 8192)
+            {
+                return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false);
+            }
+            else
+            {
+                return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false);
+            }
+        }
+        else
+        {
+            return configure_lhs_rhs_info(m, n, 2, 8, 16, 1, 1, false, false, false, false);
+        }
+    }
+}
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h
new file mode 100644
index 0000000000..c91b095279
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_VALHALL_H
+#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_VALHALL_H
+
+#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** Valhall based OpenCL GEMMNative configuration */
+class ClGemmDefaultConfigNativeValhall final : public IClGemmKernelConfig
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    ClGemmDefaultConfigNativeValhall(GPUTarget gpu);
+
+    // Inherited overridden method
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_VALHALL_H */
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h b/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h
new file mode 100644
index 0000000000..22aa1e2034
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_KERNELS_GEMM_NATIVE_CLGEMMNATIVEKERNELCONFIG_H
+#define ACL_SRC_GPU_CL_KERNELS_GEMM_NATIVE_CLGEMMNATIVEKERNELCONFIG_H
+
+#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+#include "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h"
+#include "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h"
+#include "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** CLGEMMNative factory class */
+class ClGemmNativeKernelConfigurationFactory final
+{
+public:
+    /** Static method to construct CLGEMMNative kernel object accordingly with the GPU target
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return CLGEMMNative kernel configuration class
+     */
+    static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu)
+    {
+        switch (get_arch_from_target(gpu))
+        {
+            case GPUTarget::MIDGARD:
+                return std::make_unique<ClGemmDefaultConfigNativeMidgard>(gpu);
+            case GPUTarget::BIFROST:
+                return std::make_unique<ClGemmDefaultConfigNativeBifrost>(gpu);
+            case GPUTarget::VALHALL:
+            case GPUTarget::FIFTHGEN:
+                return std::make_unique<ClGemmDefaultConfigNativeValhall>(gpu);
+            default:
+                ARM_COMPUTE_ERROR("Not supported GPU target");
+        }
+    }
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_KERNELS_GEMM_NATIVE_CLGEMMNATIVEKERNELCONFIG_H
diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp
new file mode 100644
index 0000000000..c956c347ef
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp
@@ -0,0 +1,375 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+using namespace arm_compute::misc::shape_calculator;
+
+ClGemmDefaultConfigReshapedBifrost::ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu) : IClGemmKernelConfig(gpu)
+{
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+{
+    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (
+        ClGemmDefaultConfigReshapedBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(
+        &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32, &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16,
+        &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8);
+
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52(
+        &ClGemmDefaultConfigReshapedBifrost::configure_G52_f32, &ClGemmDefaultConfigReshapedBifrost::configure_G52_f16,
+        &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8);
+
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(
+        &ClGemmDefaultConfigReshapedBifrost::configure_G76_f32, &ClGemmDefaultConfigReshapedBifrost::configure_G76_f16,
+        &ClGemmDefaultConfigReshapedBifrost::configure_G76_u8);
+
+    ConfigurationFunctionExecutorPtr func = nullptr;
+
+    switch (_target)
+    {
+        case GPUTarget::G76:
+            func = configs_G76.get_function(data_type);
+            break;
+        case GPUTarget::G52:
+            func = configs_G52.get_function(data_type);
+            break;
+        default:
+            func = configs_G7x.get_function(data_type);
+            break;
+    }
+
+    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM");
+    return (this->*func)(m, n, k, b);
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if (n <= 4)
+    {
+        return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true);
+    }
+    else
+    {
+        return configure_lhs_rhs_info(m, n, 5, 4, 4, 2, 16, false, true, false, true);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if (n <= 4)
+    {
+        return configure_lhs_rhs_info(m, n, 4, 2, 8, 8, 2, true, true, true, false);
+    }
+    else
+    {
+        return configure_lhs_rhs_info(m, n, 4, 8, 4, 4, 2, true, true, true, false);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if (dot8_supported(CLKernelLibrary::get().get_device()))
+    {
+        if (n <= 4)
+        {
+            return configure_lhs_rhs_info(m, n, 4, 2, 16, 2, 2, true, false, false, true);
+        }
+        else
+        {
+            return configure_lhs_rhs_info(m, n, 4, 4, 16, 2, 2, true, false, false, true);
+        }
+    }
+    else
+    {
+        if (n <= 4)
+        {
+            return configure_lhs_rhs_info(m, n, 4, 2, 8, 2, 2, true, false, false, true);
+        }
+        else
+        {
+            return configure_lhs_rhs_info(m, n, 6, 4, 4, 2, 2, true, true, false, true);
+        }
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    const float r_mn     = static_cast<float>(m) / static_cast<float>(n);
+    const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+    const float r_mk     = static_cast<float>(m) / static_cast<float>(k);
+    const float r_nk     = static_cast<float>(n) / static_cast<float>(k);
+
+    GEMMLHSMatrixInfo lhs_info_buf;
+    GEMMRHSMatrixInfo rhs_info_buf;
+    GEMMLHSMatrixInfo lhs_info_img;
+    GEMMRHSMatrixInfo rhs_info_img;
+
+    if (workload <= 274.4000f)
+    {
+        if (r_nk <= 0.7461f)
+        {
+            if (r_mn <= 21.1667f)
+            {
+                return configure_lhs_rhs_info(m, n, 4, 2, 4, 4, 4, false, true, true, false, false);
+            }
+            else
+            {
+                std::tie(lhs_info_img, rhs_info_img) =
+                    configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
+                std::tie(lhs_info_buf, rhs_info_buf) =
+                    configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
+
+                return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                           std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
+            }
+        }
+        else
+        {
+            std::tie(lhs_info_img, rhs_info_img) =
+                configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
+            std::tie(lhs_info_buf, rhs_info_buf) =
+                configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
+
+            return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                       std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
+        }
+    }
+    else
+    {
+        if (r_mk <= 17.3926f)
+        {
+            if (workload <= 542.4000f)
+            {
+                std::tie(lhs_info_img, rhs_info_img) =
+                    configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
+                std::tie(lhs_info_buf, rhs_info_buf) =
+                    configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
+
+                return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                           std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
+            }
+            else
+            {
+                std::tie(lhs_info_img, rhs_info_img) =
+                    configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true);
+                std::tie(lhs_info_buf, rhs_info_buf) =
+                    configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false);
+
+                return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                           std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
+            }
+        }
+        else
+        {
+            if (r_nk <= 0.5463f)
+            {
+                if (workload <= 11767.6001f)
+                {
+                    std::tie(lhs_info_img, rhs_info_img) =
+                        configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
+                    std::tie(lhs_info_buf, rhs_info_buf) =
+                        configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
+
+                    return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                               std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
+                }
+                else
+                {
+                    std::tie(lhs_info_img, rhs_info_img) =
+                        configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true);
+                    std::tie(lhs_info_buf, rhs_info_buf) =
+                        configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false);
+
+                    return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                               std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
+                }
+            }
+            else
+            {
+                std::tie(lhs_info_img, rhs_info_img) =
+                    configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
+                std::tie(lhs_info_buf, rhs_info_buf) =
+                    configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
+
+                return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                           std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
+            }
+        }
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+
+    const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+
+    if (workload <= 323.4000f)
+    {
+        return configure_lhs_rhs_info(m, n, 2, 2, 8, 4, 8, false, false, false, true, false);
+    }
+    else
+    {
+        return configure_lhs_rhs_info(m, n, 4, 8, 4, 2, 2, true, true, true, false, false);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    GEMMLHSMatrixInfo lhs_info_buf;
+    GEMMRHSMatrixInfo rhs_info_buf;
+    GEMMLHSMatrixInfo lhs_info_img;
+    GEMMRHSMatrixInfo rhs_info_img;
+
+    // Get lhs_info/rhs_info in case of OpenCL buffer
+    if (n <= 4)
+    {
+        std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true);
+    }
+    else
+    {
+        std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 2, 8, 16, false, false, false, true);
+    }
+
+    // Get lhs_info/rhs_info in case of OpenCL image
+    // Condition on the GPU workload
+    if ((m / 4) * (n / 4) >= 2560)
+    {
+        // Big workload
+        std::tie(lhs_info_img, rhs_info_img) =
+            configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 8, true, true, true, false, true);
+    }
+    else
+    {
+        // Small workload
+        std::tie(lhs_info_img, rhs_info_img) =
+            configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 1, true, true, true, false, true);
+    }
+
+    const TensorInfo  tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32);
+    const TensorShape shape = compute_rhs_reshaped_shape(tensor_rhs_info, rhs_info_img);
+    const TensorInfo  tensor_reshaped_info(shape, 1, DataType::F32);
+
+    // In case of vector by matrix with few work-items, we use the OpenCL buffer rather than the OpenCL image2d
+    const bool use_cl_image2d = (n <= 4) ? false : true;
+
+    if (bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d)
+    {
+        return std::make_pair(lhs_info_img, rhs_info_img);
+    }
+    else
+    {
+        return std::make_pair(lhs_info_buf, rhs_info_buf);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+    const float r_mk     = static_cast<float>(m) / static_cast<float>(k);
+
+    if (workload <= 1595.2000f)
+    {
+        if (r_mk <= 2.1044f)
+        {
+            if (workload <= 870.4000f)
+            {
+                return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 2, true, false, true, false, false);
+            }
+            else
+            {
+                return configure_lhs_rhs_info(m, n, 4, 2, 4, 2, 2, false, false, true, false, false);
+            }
+        }
+        else
+        {
+            return configure_lhs_rhs_info(m, n, 4, 2, 4, 2, 2, false, false, true, false, false);
+        }
+    }
+    else
+    {
+        return configure_lhs_rhs_info(m, n, 4, 8, 4, 4, 2, true, true, true, false, false);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if (n <= 4)
+    {
+        return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, false, false, false, true);
+    }
+    else
+    {
+        return configure_lhs_rhs_info(m, n, 4, 4, 16, 2, 2, false, true, false, true);
+    }
+}
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h
new file mode 100644
index 0000000000..9227ec2551
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_BIFROST_H
+#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_BIFROST_H
+
+#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** Bifrost based OpenCL GEMMReshaped configuration */
+class ClGemmDefaultConfigReshapedBifrost final : public IClGemmKernelConfig
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu);
+
+    // Inherited overridden method
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_BIFROST_H */
diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp
new file mode 100644
index 0000000000..70b324eb5a
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp
@@ -0,0 +1,538 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/GPUTarget.h"
+
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+ClGemmDefaultConfigReshapedValhall::ClGemmDefaultConfigReshapedValhall(GPUTarget gpu) : IClGemmKernelConfig(gpu)
+{
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+{
+    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (
+        ClGemmDefaultConfigReshapedValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(
+        &ClGemmDefaultConfigReshapedValhall::configure_G77_f32, &ClGemmDefaultConfigReshapedValhall::configure_G77_f16,
+        &ClGemmDefaultConfigReshapedValhall::configure_G77_u8);
+
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(
+        &ClGemmDefaultConfigReshapedValhall::configure_G78_f32, &ClGemmDefaultConfigReshapedValhall::configure_G78_f16,
+        &ClGemmDefaultConfigReshapedValhall::configure_G77_u8);
+
+    ConfigurationFunctionExecutorPtr func = nullptr;
+
+    switch (_target)
+    {
+        case GPUTarget::G78:
+            func = configs_G78.get_function(data_type);
+            break;
+        case GPUTarget::G77:
+        default:
+            func = configs_G77.get_function(data_type);
+            break;
+    }
+
+    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM");
+    return (this->*func)(m, n, k, b);
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if (n <= 4)
+    {
+        return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, 1, 0, 0, 1);
+    }
+    else
+    {
+        return configure_lhs_rhs_info(m, n, 5, 4, 4, 2, 16, 0, 1, 0, 1);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    const float r_mn     = static_cast<float>(m) / static_cast<float>(n);
+    const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+    const float r_mk     = static_cast<float>(m) / static_cast<float>(k);
+    const float r_nk     = static_cast<float>(n) / static_cast<float>(k);
+
+    GEMMLHSMatrixInfo lhs_info_buf;
+    GEMMRHSMatrixInfo rhs_info_buf;
+    GEMMLHSMatrixInfo lhs_info_img;
+    GEMMRHSMatrixInfo rhs_info_img;
+
+    std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 0);
+
+    if (r_mk <= 0.11824845522642136)
+    {
+        if (workload <= 880.0)
+        {
+            return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0);
+        }
+        else
+        {
+            if (r_nk <= 0.42521367967128754)
+            {
+                if (workload <= 1726.4000244140625)
+                {
+                    return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 0);
+                }
+                else
+                {
+                    std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1);
+
+                    return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                               std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
+                }
+            }
+            else
+            {
+                if (workload <= 1241.6000366210938)
+                {
+                    return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0);
+                }
+                else
+                {
+                    return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 0);
+                }
+            }
+        }
+    }
+    else
+    {
+        if (workload <= 11404.7998046875)
+        {
+            if (r_mk <= 1.0126488208770752)
+            {
+                if (r_mn <= 2.545312523841858)
+                {
+                    std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1);
+
+                    return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                               std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
+                }
+                else
+                {
+                    return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0);
+                }
+            }
+            else
+            {
+                if (workload <= 2881.199951171875)
+                {
+                    std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, 0, 0, 1, 0, 1);
+
+                    return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                               std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
+                }
+                else
+                {
+                    std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1);
+
+                    return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                               std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
+                }
+            }
+        }
+        else
+        {
+            if (r_nk <= 0.5765306055545807)
+            {
+                if (r_mn <= 6.010416746139526)
+                {
+                    std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1);
+
+                    return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                               std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
+                }
+                else
+                {
+                    std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 1, 0, 1, 0, 1);
+
+                    return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                               std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
+                }
+            }
+            else
+            {
+                std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 1, 0, 1, 0, 1);
+
+                return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                           std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
+            }
+        }
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    const float r_mn     = static_cast<float>(m) / static_cast<float>(n);
+    const float r_mk     = static_cast<float>(m) / static_cast<float>(k);
+    const float r_nk     = static_cast<float>(n) / static_cast<float>(k);
+    const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+
+    if (workload <= 1288.0000f)
+    {
+        if (workload <= 505.6000f)
+        {
+            if (r_mn <= 0.4466f)
+            {
+                if (r_nk <= 0.2384f)
+                {
+                    return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1);
+                }
+                else
+                {
+                    return configure_lhs_rhs_info(m, n, 2, 2, 4, 2, 2, 0, 0, 1, 0, 0);
+                }
+            }
+            else
+            {
+                return configure_lhs_rhs_info(m, n, 2, 2, 4, 2, 2, 0, 0, 1, 0, 0);
+            }
+        }
+        else
+        {
+            if (r_mn <= 0.2250f)
+            {
+                if (r_mn <= 0.1599f)
+                {
+                    return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1);
+                }
+                else
+                {
+                    return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+                }
+            }
+            else
+            {
+                if (r_mk <= 0.7609f)
+                {
+                    if (r_mn <= 2.5453f)
+                    {
+                        if (workload <= 1089.6000f)
+                        {
+                            return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1);
+                        }
+                        else
+                        {
+                            return configure_lhs_rhs_info(m, n, 2, 4, 8, 2, 4, 0, 0, 1, 0, 1);
+                        }
+                    }
+                    else
+                    {
+                        return configure_lhs_rhs_info(m, n, 2, 4, 16, 4, 4, 0, 0, 1, 0, 1);
+                    }
+                }
+                else
+                {
+                    return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1);
+                }
+            }
+        }
+    }
+    else
+    {
+        if (workload <= 5434.4001f)
+        {
+            if (workload <= 1603.2000f)
+            {
+                return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+            }
+            else
+            {
+                if (r_nk <= 0.6192f)
+                {
+                    if (r_mn <= 16.1016f)
+                    {
+                        return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+                    }
+                    else
+                    {
+                        if (workload <= 2750.0000f)
+                        {
+                            return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+                        }
+                        else
+                        {
+                            if (r_mk <= 6.3151f)
+                            {
+                                return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1);
+                            }
+                            else
+                            {
+                                return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+                            }
+                        }
+                    }
+                }
+                else
+                {
+                    if (r_mk <= 0.0387f)
+                    {
+                        return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1);
+                    }
+                    else
+                    {
+                        if (r_mk <= 2.5859f)
+                        {
+                            if (r_mk <= 0.2734f)
+                            {
+                                return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1);
+                            }
+                            else
+                            {
+                                return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+                            }
+                        }
+                        else
+                        {
+                            return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            if (r_mk <= 25.7500f)
+            {
+                if (r_mk <= 0.3615f)
+                {
+                    if (r_mn <= 0.0913f)
+                    {
+                        if (r_mk <= 0.0683f)
+                        {
+                            return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1);
+                        }
+                        else
+                        {
+                            return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1);
+                        }
+                    }
+                    else
+                    {
+                        return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+                    }
+                }
+                else
+                {
+                    if (workload <= 11174.3999f)
+                    {
+                        if (r_mk <= 0.8047f)
+                        {
+                            return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+                        }
+                        else
+                        {
+                            if (workload <= 7185.5999f)
+                            {
+                                return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1);
+                            }
+                            else
+                            {
+                                return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1);
+                            }
+                        }
+                    }
+                    else
+                    {
+                        if (workload <= 17917.5000f)
+                        {
+                            if (r_mk <= 1.5078f)
+                            {
+                                return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+                            }
+                            else
+                            {
+                                return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1);
+                            }
+                        }
+                        else
+                        {
+                            if (workload <= 34449.6016f)
+                            {
+                                return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+                            }
+                            else
+                            {
+                                return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 4, 0, 0, 1, 0, 1);
+                            }
+                        }
+                    }
+                }
+            }
+            else
+            {
+                if (r_mk <= 331.1111f)
+                {
+                    if (workload <= 53397.5996f)
+                    {
+                        if (r_mn <= 57.8063f)
+                        {
+                            return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+                        }
+                        else
+                        {
+                            return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1);
+                        }
+                    }
+                    else
+                    {
+                        if (r_nk <= 0.9211f)
+                        {
+                            return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1);
+                        }
+                        else
+                        {
+                            return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1);
+                        }
+                    }
+                }
+                else
+                {
+                    if (workload <= 38070.4004f)
+                    {
+                        return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1);
+                    }
+                    else
+                    {
+                        return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+                    }
+                }
+            }
+        }
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    const float r_mn     = static_cast<float>(m) / static_cast<float>(n);
+    const float r_nk     = static_cast<float>(n) / static_cast<float>(k);
+    const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+
+    if (workload <= 801.6000f)
+    {
+        return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1);
+    }
+    else
+    {
+        if (r_mn <= 0.1211f)
+        {
+            if (workload <= 3296.0000f)
+            {
+                return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+            }
+            else
+            {
+                if (r_nk <= 1.0625f)
+                {
+                    return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+                }
+                else
+                {
+                    return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 4, 0, 0, 1, 0, 1);
+                }
+            }
+        }
+        else
+        {
+            if (workload <= 5068.8000f)
+            {
+                return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1);
+            }
+            else
+            {
+                if (r_nk <= 0.2361f)
+                {
+                    if (workload <= 12630.0000f)
+                    {
+                        return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1);
+                    }
+                    else
+                    {
+                        return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 1, 0, 0, 1, 0, 1);
+                    }
+                }
+                else
+                {
+                    if (workload <= 178790.3984f)
+                    {
+                        return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1);
+                    }
+                    else
+                    {
+                        return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1);
+                    }
+                }
+            }
+        }
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if (n <= 4)
+    {
+        return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, 0, 0, 0, 1);
+    }
+    else
+    {
+        return configure_lhs_rhs_info(m, n, 4, 4, 16, 2, 2, 0, 1, 0, 1);
+    }
+}
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h
new file mode 100644
index 0000000000..5f62efb59e
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_VALHALL_H
+#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_VALHALL_H
+
+#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** Valhall based OpenCL GEMMReshaped configuration */
+class ClGemmDefaultConfigReshapedValhall final : public IClGemmKernelConfig
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    ClGemmDefaultConfigReshapedValhall(GPUTarget gpu);
+
+    // Inherited overridden method
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_VALHALL_H */
diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h
new file mode 100644
index 0000000000..6327ee3027
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_KERNELS_GEMM_RESHAPED_CLGEMMRESHAPEDKERNELCONFIG_H
+#define ACL_SRC_GPU_CL_KERNELS_GEMM_RESHAPED_CLGEMMRESHAPEDKERNELCONFIG_H
+
+#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+#include "src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h"
+#include "src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** CLGEMMReshaped factory class */
+class ClGemmReshapedKernelConfigurationFactory final
+{
+public:
+    /** Static method to call the CLGEMMReshaped kernel configuration class accordingly with the GPU target
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return CLGEMMReshaped kernel configuration class
+     */
+    static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu)
+    {
+        switch (get_arch_from_target(gpu))
+        {
+            case GPUTarget::MIDGARD:
+            case GPUTarget::BIFROST:
+                return std::make_unique<ClGemmDefaultConfigReshapedBifrost>(gpu);
+            case GPUTarget::VALHALL:
+            case GPUTarget::FIFTHGEN:
+                return std::make_unique<ClGemmDefaultConfigReshapedValhall>(gpu);
+            default:
+                ARM_COMPUTE_ERROR("Not supported GPU target");
+        }
+    }
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_KERNELS_GEMM_RESHAPED_CLGEMMRESHAPEDKERNELCONFIG_H
diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp
new file mode 100644
index 0000000000..c4825bfbeb
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp
@@ -0,0 +1,577 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+using namespace arm_compute::misc::shape_calculator;
+
+ClGemmDefaultConfigReshapedRhsOnlyBifrost::ClGemmDefaultConfigReshapedRhsOnlyBifrost(GPUTarget gpu)
+    : IClGemmKernelConfig(gpu)
+{
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+{
+    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (
+        ClGemmDefaultConfigReshapedRhsOnlyBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G51(
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32,
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16,
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8);
+
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52(
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32,
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16,
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8);
+
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G31(
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32,
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16,
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8);
+
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32,
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16,
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8);
+
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32,
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16,
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8);
+
+    ConfigurationFunctionExecutorPtr func = nullptr;
+    switch (_target)
+    {
+        case GPUTarget::G76:
+            func = configs_G76.get_function(data_type);
+            break;
+        case GPUTarget::G51:
+            func = configs_G51.get_function(data_type);
+            break;
+        case GPUTarget::G52:
+            func = configs_G52.get_function(data_type);
+            break;
+        case GPUTarget::G31:
+            func = configs_G31.get_function(data_type);
+            break;
+        default:
+            func = configs_G7x.get_function(data_type);
+            break;
+    }
+
+    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM");
+    return (this->*func)(m, n, k, b);
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if (m == 1)
+    {
+        if (n <= 2548)
+        {
+            return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, false, true, false, true, false);
+        }
+        else
+        {
+            return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 8, false, true, false, true, false);
+        }
+    }
+    else
+    {
+        return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 4, false, true, false, true);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if (m == 1)
+    {
+        const unsigned int h0 = std::max(n / 2, 1U);
+        return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, 0, 1, 0, 1);
+    }
+    else
+    {
+        const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1));
+        if (m >= 28)
+        {
+            return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, 0, 1, 0, 1);
+        }
+        else
+        {
+            return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, 0, 1, 0, 1);
+        }
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    GEMMLHSMatrixInfo lhs_info_buf;
+    GEMMRHSMatrixInfo rhs_info_buf;
+    GEMMLHSMatrixInfo lhs_info_img;
+    GEMMRHSMatrixInfo rhs_info_img;
+
+    const bool is_workload_big = ((m * n * b) / 16) >= 2048;
+
+    if (m == 1)
+    {
+        if (n >= 8192)
+        {
+            const unsigned int h0 = std::max(n / 4, 1U);
+            return configure_lhs_rhs_info(m, n, 1, 4, 8, 1, h0, false, true, false, true, false);
+        }
+        else
+        {
+            const unsigned int h0 = std::max(n / 2, 1U);
+            if (n <= 204)
+            {
+                return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true, false);
+            }
+            else
+            {
+                return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true, false);
+            }
+        }
+    }
+    else
+    {
+        const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1));
+        if (is_workload_big)
+        {
+            std::tie(lhs_info_buf, rhs_info_buf) =
+                configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, true);
+        }
+        else
+        {
+            std::tie(lhs_info_buf, rhs_info_buf) =
+                configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true);
+        }
+    }
+
+    // Get lhs_info/rhs_info in case of OpenCL image
+    const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1));
+    if (is_workload_big)
+    {
+        std::tie(lhs_info_img, rhs_info_img) =
+            configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true);
+    }
+    else
+    {
+        std::tie(lhs_info_img, rhs_info_img) =
+            configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true, true);
+    }
+
+    const TensorInfo  tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32);
+    const TensorShape shape = compute_rhs_reshaped_shape(tensor_rhs_info, rhs_info_img);
+    const TensorInfo  tensor_reshaped_info(shape, 1, DataType::F32);
+
+    // In case of vector by matrix or small workloads, we use the OpenCL buffer rather than the OpenCL image2d
+    const bool use_cl_image2d = ((m == 1) || ((((m * n * b) / 16) < 2048) && n < 128)) ? false : true;
+
+    if (bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d)
+    {
+        return std::make_pair(lhs_info_img, rhs_info_img);
+    }
+    else
+    {
+        return std::make_pair(lhs_info_buf, rhs_info_buf);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+    const float r_nk     = static_cast<float>(n) / static_cast<float>(k);
+
+    GEMMLHSMatrixInfo lhs_info_buf;
+    GEMMRHSMatrixInfo rhs_info_buf;
+    GEMMLHSMatrixInfo lhs_info_img;
+    GEMMRHSMatrixInfo rhs_info_img;
+
+    if (m == 1)
+    {
+        if (r_nk <= 0.4664f)
+        {
+            return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 16, false, true, false, true, false);
+        }
+        else
+        {
+            std::tie(lhs_info_img, rhs_info_img) =
+                configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, true);
+            std::tie(lhs_info_buf, rhs_info_buf) =
+                configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, false);
+
+            return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                       std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
+        }
+    }
+    else
+    {
+        if (workload <= 274.4000f)
+        {
+            return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 16, false, false, false, true, false);
+        }
+        else
+        {
+            std::tie(lhs_info_img, rhs_info_img) =
+                configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, true);
+            std::tie(lhs_info_buf, rhs_info_buf) =
+                configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, false);
+
+            return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                       std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
+        }
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if (m == 1)
+    {
+        const unsigned int n0 = n < 1280 ? 2 : 4;
+        const unsigned int h0 = std::max(n / n0, 1U);
+        return configure_lhs_rhs_info(m, n, 1, n0, 4, 1, h0, false, true, false, true);
+    }
+    else
+    {
+        return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, true, false, true);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if (m == 1)
+    {
+        if (n > 2048)
+        {
+            const unsigned int h0 = std::max(n / 4, 1U);
+            return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, false, true, false, true);
+        }
+        else
+        {
+            const unsigned int h0 = std::max(n / 2, 1U);
+            return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true);
+        }
+    }
+    else
+    {
+        return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 4, false, true, false, true);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    const float r_mn     = static_cast<float>(m) / static_cast<float>(n);
+    const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+    const float r_mk     = static_cast<float>(m) / static_cast<float>(k);
+    const float r_nk     = static_cast<float>(n) / static_cast<float>(k);
+
+    GEMMLHSMatrixInfo lhs_info_buf;
+    GEMMRHSMatrixInfo rhs_info_buf;
+    GEMMLHSMatrixInfo lhs_info_img;
+    GEMMRHSMatrixInfo rhs_info_img;
+
+    if (m == 1)
+    {
+        std::tie(lhs_info_buf, rhs_info_buf) =
+            configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, false);
+
+        if (r_mk <= 0.0026f)
+        {
+            if (r_nk <= 0.4664f)
+            {
+                return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false);
+            }
+            else
+            {
+                std::tie(lhs_info_img, rhs_info_img) =
+                    configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true);
+                return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                           std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
+            }
+        }
+        else
+        {
+            if (r_mk <= 0.0148f)
+            {
+                return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false);
+            }
+            else
+            {
+                std::tie(lhs_info_img, rhs_info_img) =
+                    configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true);
+                return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                           std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
+            }
+        }
+    }
+    else
+    {
+        std::tie(lhs_info_buf, rhs_info_buf) =
+            configure_lhs_rhs_info(m, n, 5, 8, 4, 1, 2, false, false, false, false, false);
+
+        if (workload <= 362.6000f)
+        {
+            return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false);
+        }
+        else
+        {
+            if (r_mn <= 22.6067f)
+            {
+                if (workload <= 708.8000f)
+                {
+                    std::tie(lhs_info_img, rhs_info_img) =
+                        configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true);
+                    return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                               std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
+                }
+                else
+                {
+                    return configure_lhs_rhs_info(m, n, 5, 8, 2, 1, 16, false, false, false, false, false);
+                }
+            }
+            else
+            {
+                if (r_nk <= 0.0917f)
+                {
+                    return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false);
+                }
+                else
+                {
+                    std::tie(lhs_info_img, rhs_info_img) =
+                        configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true);
+                    return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                               std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
+                }
+            }
+        }
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+
+    if (m == 1)
+    {
+        return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false);
+    }
+    else
+    {
+        const float r_mn     = static_cast<float>(m) / static_cast<float>(n);
+        const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+
+        if (workload <= 7449.60f)
+        {
+            if (workload <= 691.60f)
+            {
+                return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 8, false, false, false, false, false);
+            }
+            else
+            {
+                if (workload <= 4155.20f)
+                {
+                    return configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false);
+                }
+                else
+                {
+                    return configure_lhs_rhs_info(m, n, 5, 8, 2, 1, 32, false, false, false, false, false);
+                }
+            }
+        }
+        else
+        {
+            if (workload <= 16300.80f)
+            {
+                if (r_mn <= 44.56f)
+                {
+                    GEMMLHSMatrixInfo lhs_info_buf;
+                    GEMMRHSMatrixInfo rhs_info_buf;
+                    GEMMLHSMatrixInfo lhs_info_img;
+                    GEMMRHSMatrixInfo rhs_info_img;
+
+                    std::tie(lhs_info_img, rhs_info_img) =
+                        configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, false, true, false, false, true);
+                    std::tie(lhs_info_buf, rhs_info_buf) =
+                        configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false);
+
+                    return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                               std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
+                }
+                else
+                {
+                    return configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false);
+                }
+            }
+            else
+            {
+                GEMMLHSMatrixInfo lhs_info_buf;
+                GEMMRHSMatrixInfo rhs_info_buf;
+                GEMMLHSMatrixInfo lhs_info_img;
+                GEMMRHSMatrixInfo rhs_info_img;
+
+                std::tie(lhs_info_img, rhs_info_img) =
+                    configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, true, false, false, true);
+                std::tie(lhs_info_buf, rhs_info_buf) =
+                    configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false);
+
+                return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                           std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
+            }
+        }
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if (m == 1)
+    {
+        const unsigned int n0 = n < 1280 ? 2 : 4;
+        const unsigned int h0 = std::max(n / n0, 1U);
+        return configure_lhs_rhs_info(m, n, 1, n0, 8, 1, h0, false, true, false, true);
+    }
+    else
+    {
+        return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, true, false, true);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if (dot8_supported(CLKernelLibrary::get().get_device()))
+    {
+        if (m == 1)
+        {
+            const unsigned int h0 = std::max(n / 2, 1U);
+            return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true);
+        }
+        else
+        {
+            const unsigned int h0 = std::max(n / 4, 1U);
+            return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, h0, false, true, false, true);
+        }
+    }
+    else
+    {
+        const int h0 = std::max(std::min(static_cast<int>(n / 2), static_cast<int>(128)), static_cast<int>(1));
+        if (m == 1)
+        {
+            return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, h0, false, true, false, true);
+        }
+        else
+        {
+            return configure_lhs_rhs_info(m, n, 4, 2, 16, 1, h0, false, true, false, true);
+        }
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if (m == 1)
+    {
+        const unsigned int h0 = std::max(n / 2, 1U);
+        return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true);
+    }
+    else
+    {
+        return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, 2, false, true, false, true);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if (m == 1)
+    {
+        const unsigned int h0 = std::max(n / 2, 1U);
+        return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, false, true, false, true);
+    }
+    else
+    {
+        const unsigned int h0 = std::max(n / 2, 1U);
+        return configure_lhs_rhs_info(m, n, 4, 2, 16, 1, h0, false, true, false, true);
+    }
+}
+
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h
new file mode 100644
index 0000000000..77c0c8d500
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_BIFROST_H
+#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_BIFROST_H
+
+#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** Bifrost based OpenCL GEMMReshapedOnlyRHS configuration */
+class ClGemmDefaultConfigReshapedRhsOnlyBifrost final : public IClGemmKernelConfig
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    ClGemmDefaultConfigReshapedRhsOnlyBifrost(GPUTarget gpu);
+
+    // Inherited overridden method
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_BIFROST_H */
diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp
new file mode 100644
index 0000000000..da3e2ec912
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp
@@ -0,0 +1,755 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+#include "src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+using namespace arm_compute::misc::shape_calculator;
+
+ClGemmDefaultConfigReshapedRhsOnlyValhall::ClGemmDefaultConfigReshapedRhsOnlyValhall(GPUTarget gpu)
+    : IClGemmKernelConfig(gpu)
+{
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+{
+    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (
+        ClGemmDefaultConfigReshapedRhsOnlyValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(
+        &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32,
+        &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16,
+        &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
+
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(
+        &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32,
+        &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16,
+        &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
+
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G710(
+        &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32,
+        &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16,
+        &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
+
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G715(
+        &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32,
+        &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16,
+        &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
+
+    ConfigurationFunctionExecutorPtr func = nullptr;
+
+    switch (_target)
+    {
+        case GPUTarget::G78:
+            func = configs_G78.get_function(data_type);
+            break;
+        case GPUTarget::G710:
+        case GPUTarget::G610:
+            func = configs_G710.get_function(data_type);
+            break;
+        case GPUTarget::G715:
+        case GPUTarget::G615:
+            func = configs_G715.get_function(data_type);
+            break;
+        case GPUTarget::G77:
+        default:
+            func = configs_G77.get_function(data_type);
+            break;
+    }
+
+    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM");
+    return (this->*func)(m, n, k, b);
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    if (m == 1)
+    {
+        const float r_mn = static_cast<float>(m) / static_cast<float>(n);
+        const float r_mk = static_cast<float>(m) / static_cast<float>(k);
+
+        if (r_mk <= 0.0064484127797186375)
+        {
+            if (r_mn <= 0.0028273810748942196)
+            {
+                GEMMLHSMatrixInfo lhs_info_buf;
+                GEMMRHSMatrixInfo rhs_info_buf;
+                GEMMLHSMatrixInfo lhs_info_img;
+                GEMMRHSMatrixInfo rhs_info_img;
+
+                const unsigned int h0                = std::max(n / 4, 1U);
+                std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, 0, 1, 0, 0, 1);
+                std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, 0, 1, 0, 1, 0);
+
+                return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                           std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
+            }
+            else
+            {
+                return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 8, 0, 1, 0, 0, 0);
+            }
+        }
+        else
+        {
+            if (r_mk <= 0.020312500186264515)
+            {
+                return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, 0, 1, 0, 0, 0);
+            }
+            else
+            {
+                return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, 0, 1, 0, 1, 0);
+            }
+        }
+    }
+    else
+    {
+        const float r_mn     = static_cast<float>(m) / static_cast<float>(n);
+        const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+        const float r_mk     = static_cast<float>(m) / static_cast<float>(k);
+
+        if (workload <= 1999.2000122070312)
+        {
+            if (workload <= 747.1999816894531)
+            {
+                return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0);
+            }
+            else
+            {
+                GEMMLHSMatrixInfo lhs_info_buf;
+                GEMMRHSMatrixInfo rhs_info_buf;
+                GEMMLHSMatrixInfo lhs_info_img;
+                GEMMRHSMatrixInfo rhs_info_img;
+                std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 2, 0, 0, 0, 1, 1);
+                std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0);
+
+                return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                           std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
+            }
+        }
+        else
+        {
+            if (r_mn <= 0.03348214365541935)
+            {
+                if (r_mk <= 0.028125000186264515)
+                {
+                    return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0);
+                }
+                else
+                {
+                    GEMMLHSMatrixInfo lhs_info_buf;
+                    GEMMRHSMatrixInfo rhs_info_buf;
+                    GEMMLHSMatrixInfo lhs_info_img;
+                    GEMMRHSMatrixInfo rhs_info_img;
+                    std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 2, 0, 0, 0, 1, 1);
+                    std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0);
+
+                    return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                               std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
+                }
+            }
+            else
+            {
+                GEMMLHSMatrixInfo lhs_info_buf;
+                GEMMRHSMatrixInfo rhs_info_buf;
+                GEMMLHSMatrixInfo lhs_info_img;
+                GEMMRHSMatrixInfo rhs_info_img;
+                std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, 0, 1, 0, 0, 1);
+                std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 1, 0, 1, 0);
+
+                return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
+                                           std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
+            }
+        }
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    const GeMMConfigsMatrix configs_1nkb_best = {
+        {1, 8984, 640, 1, 1, 8, 8, 1, 0, 1, 1, 1, 1, 0},    {1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0},
+        {1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0},    {1, 6512, 6404, 1, 1, 4, 8, 1, 0, 1, 0, 1, 0, 0},
+        {1, 5304, 640, 1, 1, 4, 4, 1, 0, 1, 0, 1, 1, 0},    {1, 1352, 1520, 1, 1, 2, 8, 1, 0, 1, 1, 1, 1, 0},
+        {1, 4096, 25088, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}, {1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}};
+
+    const GeMMConfigsMatrix configs_mnkb_n_small_best = {{102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0},
+                                                         {102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1},
+                                                         {16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1},
+                                                         {16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 1}};
+
+    const GeMMConfigsMatrix configs_mnkb_n_small_fallback = {{102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0},
+                                                             {102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0},
+                                                             {16384, 4, 128, 1, 2, 2, 16, 1, 2, 1, 1, 1, 1, 0},
+                                                             {16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0}};
+
+    const GeMMConfigsMatrix configs_mnkb_m_gt_n_best = {
+        {25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0},     {25584, 16, 68, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1},
+        {369664, 32, 28, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1},   {65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0},
+        {23036, 56, 736, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1},   {90968, 40, 600, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1},
+        {8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1},    {50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0},
+        {16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1},  {12604, 60, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1},
+        {29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0},   {12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0},
+        {2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}};
+
+    const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback = {
+        {25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0},     {25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 0, 0},
+        {369664, 32, 28, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0},  {65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0},
+        {23036, 56, 736, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0},   {90968, 40, 600, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0},
+        {8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0},    {50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0},
+        {16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0},  {12604, 60, 160, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0},
+        {29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0},   {12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0},
+        {2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}};
+
+    const GeMMConfigsMatrix configs_mnkb_n_gt_m_best = {
+        {24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0},
+        {49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 1},
+        {49, 1024, 1024, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1},
+    };
+
+    const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback = {
+        {24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0},
+        {49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0},
+        {49, 1024, 1024, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0},
+    };
+
+    const GeMMConfigsMatrix configs_mnkb_squared_best = {
+        {72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0},   {268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0},
+        {180, 420, 952, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1},  {1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0},
+        {272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {196, 512, 512, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1},
+        {24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0},    {24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}};
+
+    const GeMMConfigsMatrix configs_mnkb_squared_fallback = {
+        {72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0},   {268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0},
+        {180, 420, 952, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, {1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0},
+        {272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {196, 512, 512, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0},
+        {24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0},    {24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}};
+
+    const GeMMConfigsMatrix configs_mnkb_best_batched = {
+        {3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0},  {4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1},
+        {688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0},   {24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0},
+        {112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0},
+        {1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0},  {2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}};
+
+    const GeMMConfigsMatrix configs_mnkb_fallback_batched = {
+        {3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0},  {4096, 48, 32, 36, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0},
+        {688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0},   {24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0},
+        {112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0},
+        {1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0},  {2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}};
+
+    const GeMMConfigsMatrix *configs_best_to_use     = nullptr;
+    const GeMMConfigsMatrix *configs_fallback_to_use = nullptr;
+
+    if (b == 1)
+    {
+        constexpr float        ratio_m_gt_n = 10.f;
+        constexpr float        ratio_n_gt_m = 0.1f;
+        constexpr unsigned int n_small_thr  = 4;
+        const float            ratio        = static_cast<float>(m) / static_cast<float>(n);
+
+        if (m == 1)
+        {
+            // We do not need fallback in this case, as we never use cl_image for the rhs tensor
+            configs_best_to_use     = &configs_1nkb_best;
+            configs_fallback_to_use = &configs_1nkb_best;
+        }
+        else if (n <= n_small_thr && ratio > ratio_m_gt_n)
+        {
+            configs_best_to_use     = &configs_mnkb_n_small_best;
+            configs_fallback_to_use = &configs_mnkb_n_small_fallback;
+        }
+        else if (ratio > ratio_m_gt_n)
+        {
+            configs_best_to_use     = &configs_mnkb_m_gt_n_best;
+            configs_fallback_to_use = &configs_mnkb_m_gt_n_fallback;
+        }
+        else if (ratio < ratio_n_gt_m)
+        {
+            configs_best_to_use     = &configs_mnkb_n_gt_m_best;
+            configs_fallback_to_use = &configs_mnkb_n_gt_m_fallback;
+        }
+        else
+        {
+            configs_best_to_use     = &configs_mnkb_squared_best;
+            configs_fallback_to_use = &configs_mnkb_squared_fallback;
+        }
+    }
+    else
+    {
+        configs_best_to_use     = &configs_mnkb_best_batched;
+        configs_fallback_to_use = &configs_mnkb_fallback_batched;
+    }
+
+    GEMMLHSMatrixInfo lhs_info0;
+    GEMMRHSMatrixInfo rhs_info0;
+    GEMMLHSMatrixInfo lhs_info1;
+    GEMMRHSMatrixInfo rhs_info1;
+
+    std::tie(lhs_info0, rhs_info0) = find_lhs_rhs_info(*configs_best_to_use, m, n, k, b);
+    std::tie(lhs_info1, rhs_info1) = find_lhs_rhs_info(*configs_fallback_to_use, m, n, k, b);
+
+    return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0), std::make_pair(lhs_info1, rhs_info1), n, k, b,
+                               DataType::F16);
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    if (m == 1)
+    {
+        const unsigned int h0 = std::max(n / 2, 1U);
+        return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, 0, 1, 0, 1);
+    }
+    else
+    {
+        const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1));
+        if (m >= 28)
+        {
+            return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, h0, 0, 1, 0, 1);
+        }
+        else
+        {
+            return configure_lhs_rhs_info(m, n, 2, 4, 16, 1, h0, 0, 1, 0, 1);
+        }
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    const float r_mn     = static_cast<float>(m) / static_cast<float>(n);
+    const float r_mk     = static_cast<float>(m) / static_cast<float>(k);
+    const float r_nk     = static_cast<float>(n) / static_cast<float>(k);
+    const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+
+    if (m == 1)
+    {
+        if (workload <= 278.7000f)
+        {
+            if (workload <= 7.5000f)
+            {
+                return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
+            }
+            else
+            {
+                if (r_mn <= 0.0031f)
+                {
+                    if (workload <= 256.6000f)
+                    {
+                        if (workload <= 16.7500f)
+                        {
+                            if (r_nk <= 1.6671f)
+                            {
+                                return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
+                            }
+                            else
+                            {
+                                return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
+                            }
+                        }
+                        else
+                        {
+                            return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
+                        }
+                    }
+                    else
+                    {
+                        return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
+                    }
+                }
+                else
+                {
+                    if (r_mk <= 0.0027f)
+                    {
+                        if (r_mk <= 0.0014f)
+                        {
+                            return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
+                        }
+                        else
+                        {
+                            if (workload <= 8.9500f)
+                            {
+                                return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
+                            }
+                            else
+                            {
+                                return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
+                            }
+                        }
+                    }
+                    else
+                    {
+                        if (workload <= 14.1500f)
+                        {
+                            return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
+                        }
+                        else
+                        {
+                            if (r_mk <= 0.0041f)
+                            {
+                                return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
+                            }
+                            else
+                            {
+                                return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            if (workload <= 363.7000f)
+            {
+                if (r_mk <= 0.0031f)
+                {
+                    return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 32, 0, 1, 0, 1, 0);
+                }
+                else
+                {
+                    return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 32, 0, 1, 0, 1, 0);
+                }
+            }
+            else
+            {
+                return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 32, 0, 1, 0, 1, 0);
+            }
+        }
+    }
+    else
+    {
+        if (workload <= 1384.8000f)
+        {
+            if (workload <= 704.0000f)
+            {
+                return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 32, 0, 1, 0, 1, 0);
+            }
+            else
+            {
+                return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 4, 0, 0, 0, 1, 1);
+            }
+        }
+        else
+        {
+            if (workload <= 16761.6006f)
+            {
+                if (r_mn <= 187.1250f)
+                {
+                    return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 0, 0, 1, 1);
+                }
+                else
+                {
+                    return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 4, 0, 0, 0, 1, 1);
+                }
+            }
+            else
+            {
+                if (r_mk <= 432.4630f)
+                {
+                    return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 16, 0, 0, 0, 1, 1);
+                }
+                else
+                {
+                    return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 16, 0, 1, 0, 1, 1);
+                }
+            }
+        }
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
+    const float r_mn     = static_cast<float>(m) / static_cast<float>(n);
+    const float r_mk     = static_cast<float>(m) / static_cast<float>(k);
+    const float r_nk     = static_cast<float>(n) / static_cast<float>(k);
+
+    if (m == 1)
+    {
+        const GeMMConfigsMatrix configs_mnkb_best = {
+            {1, 8984, 640, 1, 1, 4, 2, 1, 0, 1, 0, 1, 1, 0},   {1, 420, 392, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0},
+            {1, 644, 5288, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0},   {1, 6512, 6404, 1, 1, 2, 2, 1, 0, 1, 0, 1, 1, 0},
+            {1, 5304, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0},   {1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0},
+            {1, 4096, 25088, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, {1, 732, 8988, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}};
+
+        return find_lhs_rhs_info(configs_mnkb_best, m, n, k, b);
+    }
+    else
+    {
+        if (workload <= 1384.8000f)
+        {
+            if (r_nk <= 0.8333f)
+            {
+                if (r_mk <= 0.9119f)
+                {
+                    return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 4, 0, 1, 0, 1, 1);
+                }
+                else
+                {
+                    if (r_nk <= 0.1181f)
+                    {
+                        return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 32, 0, 0, 1, 0, 0);
+                    }
+                    else
+                    {
+                        return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0);
+                    }
+                }
+            }
+            else
+            {
+                if (r_mk <= 1.0013f)
+                {
+                    return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 1);
+                }
+                else
+                {
+                    return configure_lhs_rhs_info(m, n, 5, 4, 8, 1, 4, 0, 1, 1, 0, 1);
+                }
+            }
+        }
+        else
+        {
+            if (workload <= 11404.7998f)
+            {
+                if (r_mk <= 2.2884f)
+                {
+                    if (r_nk <= 0.9286f)
+                    {
+                        return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 4, 0, 1, 1, 0, 1);
+                    }
+                    else
+                    {
+                        return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 1);
+                    }
+                }
+                else
+                {
+                    return configure_lhs_rhs_info(m, n, 5, 4, 8, 1, 4, 0, 1, 1, 0, 1);
+                }
+            }
+            else
+            {
+                if (r_nk <= 1.1926f)
+                {
+                    if (r_mn <= 1385.7917f)
+                    {
+                        return configure_lhs_rhs_info(m, n, 6, 4, 8, 1, 4, 0, 1, 1, 0, 1);
+                    }
+                    else
+                    {
+                        return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 32, 0, 1, 1, 0, 0);
+                    }
+                }
+                else
+                {
+                    return configure_lhs_rhs_info(m, n, 6, 4, 8, 1, 32, 0, 1, 1, 0, 1);
+                }
+            }
+        }
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    unsigned int best_m0;
+    unsigned int best_n0;
+
+    if (is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0))
+    {
+        return configure_lhs_rhs_info(m, n, best_m0, best_n0, 1, 1, 4, false, true, false, false, true);
+    }
+    else
+    {
+        return configure_G77_f32(m, n, k, b);
+    }
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    const GeMMConfigsMatrix configs_1nkb_best = {
+        {1, 8984, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0},   {1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0},
+        {1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0},   {1, 6512, 6404, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0},
+        {1, 5304, 640, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0},   {1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0},
+        {1, 4096, 25088, 1, 1, 2, 8, 1, 0, 1, 0, 1, 1, 0}, {1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}};
+
+    const GeMMConfigsMatrix configs_mnkb_n_small_best = {{102400, 4, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0},
+                                                         {102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0},
+                                                         {16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0},
+                                                         {16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}};
+
+    const GeMMConfigsMatrix configs_mnkb_m_gt_n_best = {
+        {25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0},    {25584, 16, 68, 1, 2, 4, 16, 1, 8, 1, 1, 1, 0, 1},
+        {369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0}, {65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0},
+        {23036, 56, 736, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},   {90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+        {8944, 32, 776, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},    {2688, 136, 1492, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+        {50176, 64, 300, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1},   {16544, 104, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+        {12604, 60, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},   {3728, 96, 196, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+        {29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0},   {12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0},
+    };
+
+    const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback = {
+        {25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0},    {25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0},
+        {369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0}, {65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0},
+        {23036, 56, 736, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0},  {90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 0},
+        {8944, 32, 776, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0},   {2688, 136, 1492, 1, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0},
+        {50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}, {16544, 104, 160, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0},
+        {12604, 60, 160, 1, 2, 8, 8, 1, 8, 1, 1, 1, 0, 0},   {3728, 96, 196, 1, 2, 8, 8, 1, 64, 1, 1, 1, 0, 0},
+        {29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0},   {12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0},
+    };
+
+    const GeMMConfigsMatrix configs_mnkb_n_gt_m_best = {{24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0},
+                                                        {49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0},
+                                                        {49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0}};
+
+    const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback = {{24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0},
+                                                            {49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0},
+                                                            {49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0}};
+
+    const GeMMConfigsMatrix configs_mnkb_squared_best = {
+        {24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0},    {24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0},
+        {72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0},   {268, 824, 5076, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+        {180, 420, 952, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1}, {1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0},
+        {272, 400, 2116, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {196, 512, 512, 1, 5, 2, 8, 1, 4, 1, 1, 1, 1, 1},
+    };
+
+    const GeMMConfigsMatrix configs_mnkb_squared_fallback = {
+        {24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0},    {24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0},
+        {72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0},   {268, 824, 5076, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0},
+        {180, 420, 952, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0},  {1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0},
+        {272, 400, 2116, 1, 2, 8, 4, 1, 4, 1, 1, 1, 0, 0}, {196, 512, 512, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0},
+    };
+
+    const GeMMConfigsMatrix configs_mnkb_best_batched = {
+        {3136, 64, 64, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 1}, {4096, 48, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+        {688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 1},  {24, 464, 412, 24, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+        {112, 184, 144, 28, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {5776, 64, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+        {1568, 64, 40, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1},  {2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1}};
+
+    const GeMMConfigsMatrix configs_mnkb_fallback_batched = {
+        {3136, 64, 64, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0},  {4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0},
+        {688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0},  {24, 464, 412, 24, 2, 8, 4, 1, 32, 1, 1, 1, 0, 0},
+        {112, 184, 144, 28, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0}, {5776, 64, 32, 36, 2, 8, 8, 1, 32, 1, 1, 1, 0, 0},
+        {1568, 64, 40, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}};
+
+    const GeMMConfigsMatrix *configs_best_to_use     = nullptr;
+    const GeMMConfigsMatrix *configs_fallback_to_use = nullptr;
+
+    if (b == 1)
+    {
+        constexpr float        ratio_m_gt_n = 10.f;
+        constexpr float        ratio_n_gt_m = 0.1f;
+        constexpr unsigned int n_small_thr  = 4;
+        const float            ratio        = static_cast<float>(m) / static_cast<float>(n);
+
+        if (m == 1)
+        {
+            // We do not need fallback in this case, as we never use cl_image for the rhs tensor
+            configs_best_to_use     = &configs_1nkb_best;
+            configs_fallback_to_use = &configs_1nkb_best;
+        }
+        else if (n <= n_small_thr && ratio > ratio_m_gt_n)
+        {
+            configs_best_to_use     = &configs_mnkb_n_small_best;
+            configs_fallback_to_use = &configs_mnkb_n_small_best;
+        }
+        else if (ratio > ratio_m_gt_n)
+        {
+            configs_best_to_use     = &configs_mnkb_m_gt_n_best;
+            configs_fallback_to_use = &configs_mnkb_m_gt_n_fallback;
+        }
+        else if (ratio < ratio_n_gt_m)
+        {
+            configs_best_to_use     = &configs_mnkb_n_gt_m_best;
+            configs_fallback_to_use = &configs_mnkb_n_gt_m_fallback;
+        }
+        else
+        {
+            configs_best_to_use     = &configs_mnkb_squared_best;
+            configs_fallback_to_use = &configs_mnkb_squared_fallback;
+        }
+    }
+    else
+    {
+        configs_best_to_use     = &configs_mnkb_best_batched;
+        configs_fallback_to_use = &configs_mnkb_fallback_batched;
+    }
+
+    GEMMLHSMatrixInfo lhs_info0;
+    GEMMRHSMatrixInfo rhs_info0;
+    GEMMLHSMatrixInfo lhs_info1;
+    GEMMRHSMatrixInfo rhs_info1;
+
+    std::tie(lhs_info0, rhs_info0) = find_lhs_rhs_info(*configs_best_to_use, m, n, k, b);
+    std::tie(lhs_info1, rhs_info1) = find_lhs_rhs_info(*configs_fallback_to_use, m, n, k, b);
+
+    return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0), std::make_pair(lhs_info1, rhs_info1), n, k, b,
+                               DataType::F16);
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    unsigned int best_m0;
+    unsigned int best_n0;
+
+    if (is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0))
+    {
+        return configure_lhs_rhs_info(m, n, best_m0, best_n0, 1, 1, 4, false, true, false, false, true);
+    }
+    else
+    {
+        return configure_G78_f16(m, n, k, b);
+    }
+}
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h
new file mode 100644
index 0000000000..a0ea337eb1
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2020-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_VALHALL_H
+#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_VALHALL_H
+
+#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** Valhall based OpenCL GEMMReshapedOnlyRHS configuration */
+class ClGemmDefaultConfigReshapedRhsOnlyValhall final : public IClGemmKernelConfig
+{
+public:
+    /** Constructor
+     *
+     * @param[in] gpu GPU target
+     */
+    ClGemmDefaultConfigReshapedRhsOnlyValhall(GPUTarget gpu);
+
+    // Inherited overridden method
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+
+private:
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_VALHALL_H */
diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h
new file mode 100644
index 0000000000..1f0c5c2d87
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2019-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_KERNELS_GEMM_RESHAPED_ONLY_RHS_CLGEMMRESHAPEDONLYRHSKERNELCONFIG_H
+#define ACL_SRC_GPU_CL_KERNELS_GEMM_RESHAPED_ONLY_RHS_CLGEMMRESHAPEDONLYRHSKERNELCONFIG_H
+
+#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+#include "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h"
+#include "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** CLGEMMReshapedOnlyRHS factory class */
+class ClGemmReshapedOnlyRhsKernelConfigurationFactory final
+{
+public:
+    /** Static method to call the CLGEMMReshapedOnlyRHS kernel configuration class accordingly with the GPU target
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return CLGEMMReshapedOnlyRHS kernel configuration class
+     */
+    static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu)
+    {
+        switch (get_arch_from_target(gpu))
+        {
+            case GPUTarget::MIDGARD:
+            case GPUTarget::BIFROST:
+                return std::make_unique<ClGemmDefaultConfigReshapedRhsOnlyBifrost>(gpu);
+            case GPUTarget::VALHALL:
+            case GPUTarget::FIFTHGEN:
+                return std::make_unique<ClGemmDefaultConfigReshapedRhsOnlyValhall>(gpu);
+            default:
+                ARM_COMPUTE_ERROR("Not supported GPU target");
+        }
+    }
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_KERNELS_GEMM_RESHAPED_ONLY_RHS_CLGEMMRESHAPEDONLYRHSKERNELCONFIG_H
diff --git a/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp
new file mode 100644
index 0000000000..689a743fdf
--- /dev/null
+++ b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h"
+
+#include "arm_compute/core/Coordinates.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/math/Math.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+Status validate_matmul_input_shapes(const TensorShape      &lhs_shape,
+                                    const TensorShape      &rhs_shape,
+                                    const MatMulKernelInfo &matmul_kernel_info)
+{
+    const size_t lhs_k = matmul_kernel_info.adj_lhs ? lhs_shape.y() : lhs_shape.x();
+    const size_t rhs_k = matmul_kernel_info.adj_rhs ? rhs_shape.x() : rhs_shape.y();
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_k != rhs_k, "K dimension in Lhs and Rhs matrices must match.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_shape.total_size() == 0, "Lhs tensor can't be empty");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_shape.total_size() == 0, "Rhs tensor can't be empty");
+
+    constexpr size_t batch_dim_start = 2;
+    for (size_t i = batch_dim_start; i < Coordinates::num_max_dimensions; ++i)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_shape[i] != rhs_shape[i], "Batch dimension broadcasting is not supported");
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window_for_mmul_kernels(const ITensorInfo      *lhs,
+                                                                         const ITensorInfo      *rhs,
+                                                                         const ITensorInfo      *dst,
+                                                                         const MatMulKernelInfo &matmul_kernel_info,
+                                                                         int                     mmul_m0,
+                                                                         int                     mmul_n0)
+{
+    ARM_COMPUTE_UNUSED(lhs, rhs);
+
+    const Window win = calculate_max_window(*dst, Steps(1, 1));
+
+    // Collapse along the Z direction
+    // This collapse needs to be here in order to tune the Z dimension of LWS
+    Window collapsed = win.collapse(win, Window::DimZ);
+
+    // Reconfigure window size, one arm_matrix_multiply call needs 16 threads to finish.
+    Window::Dimension x_dimension = collapsed.x();
+    Window::Dimension y_dimension = collapsed.y();
+
+    const int m = dst->dimension(1);
+    const int n = dst->dimension(0);
+
+    const int m0 = std::min(matmul_kernel_info.m0, m);
+    const int n0 = adjust_vec_size(matmul_kernel_info.n0, n);
+
+    // Make M and N multiple of M0 and N0 respectively
+    const unsigned int ceil_to_multiple_n_n0 = ceil_to_multiple(n, n0);
+    const unsigned int ceil_to_multiple_m_m0 = ceil_to_multiple(m, m0);
+
+    // Divide M and N by M0 and N0 respectively
+    const unsigned int n_div_n0 = ceil_to_multiple_n_n0 / n0;
+    const unsigned int m_div_m0 = ceil_to_multiple_m_m0 / m0;
+
+    // Make n_div_n0 and m_div_m0 multiple of mmul_n0 and mmul_m0 respectively
+    const unsigned int ceil_to_multiple_n_div_n0_mmul_n0 = ceil_to_multiple(n_div_n0, mmul_n0);
+    const unsigned int ceil_to_multiple_m_div_m0_mmul_m0 = ceil_to_multiple(m_div_m0, mmul_m0);
+
+    // Ensure x_dimension is multiple of MMUL block size (mmul_m0 * mmul_n0)
+    x_dimension.set_end(ceil_to_multiple_n_div_n0_mmul_n0 * mmul_m0);
+    y_dimension.set_end(ceil_to_multiple_m_div_m0_mmul_m0 / mmul_m0);
+
+    collapsed.set(Window::DimX, x_dimension);
+    collapsed.set(Window::DimY, y_dimension);
+
+    return std::make_pair(Status{}, collapsed);
+}
+
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h
new file mode 100644
index 0000000000..c2ae2a67f4
--- /dev/null
+++ b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_GPU_CL_KERNELS_HELPERS_MATMULKERNELHELPERS_H
+#define ACL_SRC_GPU_CL_KERNELS_HELPERS_MATMULKERNELHELPERS_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Validate the input shapes of Matmul operation
+ *
+ * @param[in] lhs_shape          Lhs tensor shape
+ * @param[in] rhs_shape          Rhs tensor shape
+ * @param[in] matmul_kernel_info Matmul kernel info
+ *
+ * @return true if the shapes and matmul kernel info matches
+ */
+Status validate_matmul_input_shapes(const TensorShape      &lhs_shape,
+                                    const TensorShape      &rhs_shape,
+                                    const MatMulKernelInfo &matmul_kernel_info);
+
+/** Validate and configure window for Matmul MMUL kernels
+ *
+ * @param[in] lhs                Lhs tensor info
+ * @param[in] rhs                Rhs tensor info
+ * @param[in] dst                Dst tensor info
+ * @param[in] matmul_kernel_info Matmul kernel info
+ * @param[in] mmul_m0            Number of rows in the MMUL block
+ * @param[in] mmul_n0            Number of columns in the MMUL block
+ *
+ * @return a pair of Status and Window object
+ */
+std::pair<Status, Window> validate_and_configure_window_for_mmul_kernels(const ITensorInfo      *lhs,
+                                                                         const ITensorInfo      *rhs,
+                                                                         const ITensorInfo      *dst,
+                                                                         const MatMulKernelInfo &matmul_kernel_info,
+                                                                         int                     mmul_m0,
+                                                                         int                     mmul_n0);
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+
+#endif // ACL_SRC_GPU_CL_KERNELS_HELPERS_MATMULKERNELHELPERS_H
diff --git a/src/gpu/cl/operators/ClActivation.cpp b/src/gpu/cl/operators/ClActivation.cpp
new file mode 100644
index 0000000000..66877ebcec
--- /dev/null
+++ b/src/gpu/cl/operators/ClActivation.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClActivation.h"
+
+#include "src/common/IOperator.h"
+#include "src/common/utils/LegacySupport.h"
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/ClContext.h"
+#include "src/gpu/cl/kernels/ClActivationKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClActivation::configure(const ClCompileContext    &compile_context,
+                             ITensorInfo               *src,
+                             ITensorInfo               *dst,
+                             const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_LOG_PARAMS(src, dst, act_info);
+    auto k = std::make_unique<kernels::ClActivationKernel>();
+    k->configure(compile_context, src, dst, act_info);
+    _kernel = std::move(k);
+}
+
+Status ClActivation::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+{
+    return kernels::ClActivationKernel::validate(src, dst, act_info);
+}
+} // namespace opencl
+
+namespace gpu
+{
+namespace opencl
+{
+std::tuple<IOperator *, StatusCode> ClContext::create_activation(const AclTensorDescriptor     &src,
+                                                                 const AclTensorDescriptor     &dst,
+                                                                 const AclActivationDescriptor &act,
+                                                                 bool                           is_validate)
+{
+    TensorInfo src_info = detail::convert_to_legacy_tensor_info(src);
+    TensorInfo dst_info = detail::convert_to_legacy_tensor_info(dst);
+    auto       info     = detail::convert_to_activation_info(act);
+
+    if (is_validate && !bool(arm_compute::opencl::ClActivation::validate(&src_info.set_is_resizable(false),
+                                                                         &dst_info.set_is_resizable(false), info)))
+    {
+        return std::make_tuple(nullptr, StatusCode::UnsupportedConfig);
+    }
+
+    auto act_op = std::make_unique<arm_compute::opencl::ClActivation>();
+    act_op->configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info, info);
+
+    auto op = new arm_compute::IOperator(static_cast<IContext *>(this));
+    if (op == nullptr)
+    {
+        ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources");
+        return std::make_tuple(nullptr, StatusCode::OutOfMemory);
+    }
+    op->set_internal_operator(std::move(act_op));
+
+    return std::make_tuple(op, StatusCode::Success);
+}
+} // namespace opencl
+} // namespace gpu
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClActivation.h b/src/gpu/cl/operators/ClActivation.h
new file mode 100644
index 0000000000..4f25bb5f24
--- /dev/null
+++ b/src/gpu/cl/operators/ClActivation.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_ACTIVATION_H
+#define ARM_COMPUTE_CL_ACTIVATION_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClActivationKernel */
+class ClActivation : public IClOperator
+{
+public:
+    /** Configure operator for a given list of arguments
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
+     * @param[out] dst             Destination tensor info. Data type supported: same as @p src
+     * @param[in]  activation_info Activation layer parameters.
+     */
+    void configure(const ClCompileContext    &compile_context,
+                   ITensorInfo               *src,
+                   ITensorInfo               *dst,
+                   const ActivationLayerInfo &activation_info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClActivation::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_ACTIVATION_H */
diff --git a/src/gpu/cl/operators/ClAdd.cpp b/src/gpu/cl/operators/ClAdd.cpp
new file mode 100644
index 0000000000..b58d0df58d
--- /dev/null
+++ b/src/gpu/cl/operators/ClAdd.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClAdd.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClAdd::configure(const ClCompileContext    &compile_context,
+                      ITensorInfo               *src1,
+                      ITensorInfo               *src2,
+                      ITensorInfo               *dst,
+                      ConvertPolicy              policy,
+                      const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, policy, act_info);
+    auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>();
+    k->configure(compile_context, ArithmeticOperation::ADD, src1, src2, dst, policy, act_info);
+    _kernel = std::move(k);
+}
+
+Status ClAdd::validate(const ITensorInfo         *src1,
+                       const ITensorInfo         *src2,
+                       const ITensorInfo         *dst,
+                       ConvertPolicy              policy,
+                       const ActivationLayerInfo &act_info)
+{
+    return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::ADD, src1, src2, dst, policy, act_info);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClAdd.h b/src/gpu/cl/operators/ClAdd.h
new file mode 100644
index 0000000000..7aed902f5d
--- /dev/null
+++ b/src/gpu/cl/operators/ClAdd.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_ADD_H
+#define ARM_COMPUTE_CL_ADD_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run arithmetic addition
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+ * @note The function performs an arithmetic addition between two tensors.
+ */
+class ClAdd : public IClOperator
+{
+public:
+    /** Configure function for a given list of arguments.
+     *
+     * Valid configurations (src1,src2) -> dst :
+     *
+     *   - (U8,U8)           -> U8
+     *   - (U8,U8)           -> S16
+     *   - (S16,U8)          -> S16
+     *   - (U8,S16)          -> S16
+     *   - (S16,S16)         -> S16
+     *   - (S32,S32)         -> S32
+     *   - (F16,F16)         -> F16
+     *   - (F32,F32)         -> F32
+     *   - (QASYMM8,QASYMM8) -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16) -> QSYMM16
+     *
+     * @param[in]      compile_context The compile context to be used.
+     * @param[in, out] src1            First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     *                                 The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[in, out] src2            Second source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     *                                 The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[out]     dst             Destination tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     * @param[in]      policy          Policy to use to handle overflow.
+     * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const ClCompileContext    &compile_context,
+                   ITensorInfo               *src1,
+                   ITensorInfo               *src2,
+                   ITensorInfo               *dst,
+                   ConvertPolicy              policy,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClAdd::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo         *src1,
+                           const ITensorInfo         *src2,
+                           const ITensorInfo         *dst,
+                           ConvertPolicy              policy,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_ADD_H */
diff --git a/src/gpu/cl/operators/ClCast.cpp b/src/gpu/cl/operators/ClCast.cpp
new file mode 100644
index 0000000000..8f26ef003d
--- /dev/null
+++ b/src/gpu/cl/operators/ClCast.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClCast.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClCastKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClCast::configure(const ClCompileContext &compile_context,
+                       const ITensorInfo      *src,
+                       ITensorInfo            *dst,
+                       ConvertPolicy           policy)
+{
+    ARM_COMPUTE_LOG_PARAMS(src, dst, policy);
+    auto k = std::make_unique<kernels::ClCastKernel>();
+    k->configure(compile_context, src, dst, policy);
+    _kernel = std::move(k);
+}
+
+Status ClCast::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
+{
+    return kernels::ClCastKernel::validate(src, dst, policy);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClCast.h b/src/gpu/cl/operators/ClCast.h
new file mode 100644
index 0000000000..25d2293673
--- /dev/null
+++ b/src/gpu/cl/operators/ClCast.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_CAST_H
+#define ARM_COMPUTE_CL_CAST_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClCastKernel */
+class ClCast : public IClOperator
+{
+public:
+    /** Configure operator for a given list of arguments
+     *
+     * @note Input data type must be different than output data type.
+     *
+     * Valid data layouts:
+     * - All
+     *
+     * Valid data type configurations:
+     * |src            |dst                                    |
+     * |:--------------|:--------------------------------------|
+     * |U8             | S8, U16, S16, U32, S32, F16, F32      |
+     * |U16            | U8, S8, S16, U32, S32, F16, F32       |
+     * |S16            | U8, S8, U16, U32, S32, F16, F32       |
+     * |U32            | U8, S8, U16, S16, S32, F16, F32       |
+     * |S32            | U8, S8, U16, S16, U32, F16, F32       |
+     * |F16            | U8, S8, U16, S16, U32, F32            |
+     * |F32            | U8, S8, U16, S16, U32, F16            |
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             The source tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
+     * @param[out] dst             The destinatio tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
+     * @param[in]  policy          Conversion policy.
+     */
+    void
+    configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClCast::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_CAST_H */
diff --git a/src/gpu/cl/operators/ClConcatenate.cpp b/src/gpu/cl/operators/ClConcatenate.cpp
new file mode 100644
index 0000000000..31018b9768
--- /dev/null
+++ b/src/gpu/cl/operators/ClConcatenate.cpp
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClConcatenate.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/gpu/cl/kernels/ClBatchConcatenateKernel.h"
+#include "src/gpu/cl/kernels/ClDepthConcatenateKernel.h"
+#include "src/gpu/cl/kernels/ClHeightConcatenateKernel.h"
+#include "src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h"
+#include "src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h"
+#include "src/gpu/cl/kernels/ClWidthConcatenateKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClConcatenate::configure(const CLCompileContext           &compile_context,
+                              const std::vector<ITensorInfo *> &src_vector,
+                              ITensorInfo                      *dst,
+                              size_t                            axis)
+{
+    ARM_COMPUTE_ERROR_ON(dst == nullptr);
+    ARM_COMPUTE_LOG_PARAMS(src_vector, dst, axis);
+    _axis       = axis;
+    _num_inputs = src_vector.size();
+
+    TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, _axis);
+    std::vector<const ITensorInfo *> const_src_vector(src_vector.size());
+    std::transform(src_vector.begin(), src_vector.end(), const_src_vector.begin(),
+                   [](ITensorInfo *t)
+                   {
+                       ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+                       return t;
+                   });
+
+    // dst auto inizialitation if not yet initialized
+    auto_init_if_empty(*dst, dst_shape, 1, src_vector[0]->data_type());
+    ARM_COMPUTE_ERROR_THROW_ON(ClConcatenate::validate(const_src_vector, dst, axis));
+
+    unsigned int offset = 0;
+    switch (_axis)
+    {
+        case Window::DimX:
+        {
+            switch (_num_inputs)
+            {
+                case 2:
+                {
+                    // Configure WidthConcatenate2Tensors kernel
+                    auto kernel = std::make_unique<kernels::ClWidthConcatenate2TensorsKernel>();
+                    kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), dst);
+                    _concat_kernels.emplace_back(std::move(kernel));
+                    break;
+                }
+                case 4:
+                {
+                    // Configure WidthConcatenate4Tensors kernel
+                    auto kernel = std::make_unique<kernels::ClWidthConcatenate4TensorsKernel>();
+                    kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), src_vector.at(2),
+                                      src_vector.at(3), dst);
+                    _concat_kernels.emplace_back(std::move(kernel));
+                    break;
+                }
+                default:
+                {
+                    // Configure generic case WidthConcatenate kernels
+                    for (unsigned int i = 0; i < _num_inputs; ++i)
+                    {
+                        auto kernel = std::make_unique<kernels::ClWidthConcatenateKernel>();
+                        kernel->configure(compile_context, src_vector.at(i), offset, dst);
+                        offset += src_vector.at(i)->dimension(_axis);
+                        _concat_kernels.emplace_back(std::move(kernel));
+                    }
+                    break;
+                }
+            }
+            break;
+        }
+        case Window::DimY:
+        {
+            for (unsigned int i = 0; i < _num_inputs; ++i)
+            {
+                auto kernel = std::make_unique<kernels::ClHeightConcatenateKernel>();
+                kernel->configure(compile_context, src_vector.at(i), offset, dst);
+                offset += src_vector.at(i)->dimension(_axis);
+                _concat_kernels.emplace_back(std::move(kernel));
+            }
+            break;
+        }
+        case Window::DimZ:
+        {
+            for (unsigned int i = 0; i < _num_inputs; ++i)
+            {
+                auto kernel = std::make_unique<kernels::ClDepthConcatenateKernel>();
+                kernel->configure(compile_context, src_vector.at(i), offset, dst);
+                offset += src_vector.at(i)->dimension(_axis);
+                _concat_kernels.emplace_back(std::move(kernel));
+            }
+            break;
+        }
+        case 3:
+        {
+            for (unsigned int i = 0; i < _num_inputs; ++i)
+            {
+                auto kernel = std::make_unique<kernels::ClBatchConcatenateKernel>();
+                kernel->configure(compile_context, src_vector.at(i), offset, dst);
+                offset += src_vector.at(i)->dimension(_axis);
+                _concat_kernels.emplace_back(std::move(kernel));
+            }
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Axis not supported");
+    }
+}
+
+Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vector, const ITensorInfo *dst, size_t axis)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(dst == nullptr);
+    const unsigned int num_inputs = src_vector.size();
+
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
+    ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2);
+
+    unsigned int offset = 0;
+    switch (axis)
+    {
+        case Window::DimX:
+        {
+            switch (num_inputs)
+            {
+                case 2:
+                    // Validate WidthConcatenate2Tensors kernels if there are 2 inputs
+                    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1]);
+                    ARM_COMPUTE_RETURN_ON_ERROR(
+                        kernels::ClWidthConcatenate2TensorsKernel::validate(src_vector[0], src_vector[1], dst));
+                    break;
+                case 4:
+                    // Validate WidthConcatenate4Tensors kernels if there are 4 inputs
+                    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1], src_vector[2], src_vector[3]);
+                    ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate4TensorsKernel::validate(
+                        src_vector[0], src_vector[1], src_vector[2], src_vector[3], dst));
+                    break;
+                default:
+                    // Validate generic case of WidthConcatenate kernel
+                    for (const auto &src : src_vector)
+                    {
+                        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+                        ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenateKernel::validate(src, offset, dst));
+                        offset += src->dimension(axis);
+                    }
+                    break;
+            }
+            break;
+        }
+        case Window::DimY:
+        {
+            for (const auto &src : src_vector)
+            {
+                ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClHeightConcatenateKernel::validate(src, offset, dst));
+                offset += src->dimension(axis);
+            }
+            break;
+        }
+        case Window::DimZ:
+        {
+            for (const auto &src : src_vector)
+            {
+                ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDepthConcatenateKernel::validate(src, offset, dst));
+                offset += src->dimension(axis);
+            }
+            break;
+        }
+        case 3:
+        {
+            for (const auto &src : src_vector)
+            {
+                ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClBatchConcatenateKernel::validate(src, offset, dst));
+                offset += src->dimension(axis);
+            }
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Axis not supported");
+    }
+
+    if (dst->total_size() != 0)
+    {
+        TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, axis);
+        ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size());
+    }
+
+    return Status{};
+}
+
+void ClConcatenate::run(ITensorPack &tensors)
+{
+    if (tensors.empty())
+    {
+        ARM_COMPUTE_ERROR("No inputs provided");
+    }
+
+    if (static_cast<int>(tensors.size()) - 1 != static_cast<int>(_num_inputs))
+    {
+        ARM_COMPUTE_ERROR("Configured with different number of inputs");
+    }
+
+    if (_axis == Window::DimX && (_num_inputs == 2 || _num_inputs == 4))
+    {
+        ARM_COMPUTE_ERROR_ON(_concat_kernels.empty());
+        CLScheduler::get().enqueue_op(*_concat_kernels.at(0), tensors, true);
+    }
+    else
+    {
+        int i = 0;
+        for (auto &k : _concat_kernels)
+        {
+            ITensorPack pack;
+            pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i));
+            pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(ACL_DST));
+            CLScheduler::get().enqueue_op(*k, pack, true);
+            ++i;
+        }
+    }
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClConcatenate.h b/src/gpu/cl/operators/ClConcatenate.h
new file mode 100644
index 0000000000..d8ce9d2a5c
--- /dev/null
+++ b/src/gpu/cl/operators/ClConcatenate.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLCONCATENATE_H
+#define ARM_COMPUTE_CLCONCATENATE_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/IClOperator.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to execute concatenate tensors along a given axis. This function calls the following kernels:
+ *
+ * -# @ref kernels::ClWidthConcatenateKernel (if underlying concatenation axis is 0).
+ * -# @ref kernels::ClHeightConcatenateKernel (if underlying concatenation axis is 1).
+ * -# @ref kernels::ClDepthConcatenateKernel (if underlying concatenation axis is 2).
+ * -# @ref kernels::ClBatchConcatenateKernel (if underlying concatenation axis is 3).
+ */
+class ClConcatenate : public IClOperator
+{
+public:
+    ClConcatenate() = default;
+    /** Initialise the kernel's inputs vector and dst.
+     *
+     * @note Input and dst tensor dimensions preconditions defer depending on the concatenation axis.
+     * @note Preconditions can be found respectively at @ref kernels::ClWidthConcatenateKernel,
+     *       @ref kernels::ClHeightConcatenateKernel and @ref kernels::ClDepthConcatenateKernel.
+     *
+     *
+     * @param[in]     compile_context The compile context to be used.
+     * @param[in,out] src_vector      The vectors containing all the tensors info to concatenate. Data types supported: All
+     * @param[out]    dst             Destination tensor info. Data types supported: same as @p src_vector.
+     * @param[in]     axis            Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
+     */
+    void configure(const ClCompileContext           &compile_context,
+                   const std::vector<ITensorInfo *> &src_vector,
+                   ITensorInfo                      *dst,
+                   size_t                            axis);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClConcatenate::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const std::vector<const ITensorInfo *> &src_vector, const ITensorInfo *dst, size_t axis);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+
+private:
+    std::vector<std::unique_ptr<IClKernel>> _concat_kernels{};
+    unsigned int                            _num_inputs{0};
+    unsigned int                            _axis{0};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_CONCATENATE_H */
diff --git a/src/gpu/cl/operators/ClConv2d.cpp b/src/gpu/cl/operators/ClConv2d.cpp
new file mode 100644
index 0000000000..2c3b0214fa
--- /dev/null
+++ b/src/gpu/cl/operators/ClConv2d.cpp
@@ -0,0 +1,432 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClConv2d.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/operators/ClDirectConv2d.h"
+#include "src/gpu/cl/operators/ClGemmConv2d.h"
+#include "src/gpu/cl/operators/ClIndirectConv2d.h"
+#include "src/gpu/cl/operators/ClWinogradConv2d.h"
+
+#include <memory>
+
+namespace
+{
+/** Get the suitable kernel size for using direct convolution method with NHWC data layout.
+ *
+ * @note Direct convolution should be executed when the kernel has the spatial dimensions greater than or equal to the value returned by this function
+ *
+ * @param[in] gpu_target GPU target
+ *
+ * @return the suitable kernel size for using direct convolution method with NHWC data layout
+ */
+size_t get_direct_conv_kernel_threshold_nhwc(arm_compute::GPUTarget gpu_target)
+{
+    switch (gpu_target)
+    {
+        case arm_compute::GPUTarget::G76:
+        case arm_compute::GPUTarget::G77:
+        case arm_compute::GPUTarget::G78:
+            return 5;
+        case arm_compute::GPUTarget::G71:
+        case arm_compute::GPUTarget::G72:
+        case arm_compute::GPUTarget::MIDGARD:
+        case arm_compute::GPUTarget::BIFROST:
+            return 7;
+        default:
+            return 5;
+    }
+}
+} // namespace
+
+namespace arm_compute
+{
+namespace opencl
+{
+using namespace arm_compute::misc::shape_calculator;
+
+ClConv2d::ClConv2d() : _operator()
+{
+}
+
+ClConv2d::~ClConv2d() = default;
+
+void ClConv2d::configure(const CLCompileContext &compile_context,
+                         ITensorInfo            *src,
+                         ITensorInfo            *weights,
+                         ITensorInfo            *biases,
+                         ITensorInfo            *dst,
+                         const Conv2dInfo       &conv2d_info,
+                         const WeightsInfo      &weights_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(
+        ClConv2d::validate(src, weights, ((biases != nullptr) ? biases : nullptr), dst, conv2d_info, weights_info));
+    ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv2d_info, weights_info);
+
+    switch (ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, CLScheduler::get().target()))
+    {
+        case ConvolutionMethod::WINOGRAD:
+        {
+            ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1);
+            auto f = std::make_unique<ClWinogradConv2d>();
+            f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info,
+                         conv2d_info.enable_fast_math);
+            _operator = std::move(f);
+            break;
+        }
+        case ConvolutionMethod::DIRECT:
+        {
+            ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1);
+            auto f = std::make_unique<ClDirectConv2d>();
+            f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info);
+            _operator = std::move(f);
+            break;
+        }
+        case ConvolutionMethod::INDIRECT:
+        {
+            ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1);
+            auto f = std::make_unique<ClIndirectConv2d>();
+            f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info);
+            _operator = std::move(f);
+            break;
+        }
+        case ConvolutionMethod::GEMM:
+        {
+            auto f = std::make_unique<ClGemmConv2d>();
+            f->configure(compile_context, src, weights, biases, dst, conv2d_info, weights_info);
+            _operator = std::move(f);
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not supported.");
+            break;
+    }
+    _aux_mem = _operator->workspace();
+}
+
+Status ClConv2d::validate(const ITensorInfo *src,
+                          const ITensorInfo *weights,
+                          const ITensorInfo *biases,
+                          const ITensorInfo *dst,
+                          const Conv2dInfo  &conv2d_info,
+                          const WeightsInfo &weights_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW),
+                                    "Grouping (num_groups != 1) with NHWC data layout is not supported");
+
+    const GPUTarget gpu_target = CLScheduler::get().target();
+
+    switch (ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, gpu_target))
+    {
+        case ConvolutionMethod::WINOGRAD:
+        {
+            //Validate Winograd
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1,
+                                            "Grouping (num_groups != 1) with ClWinogradConv2d is not supported");
+            ARM_COMPUTE_RETURN_ON_ERROR(ClWinogradConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info,
+                                                                   conv2d_info.act_info, conv2d_info.enable_fast_math));
+            break;
+        }
+        case ConvolutionMethod::DIRECT:
+        {
+            // Validate direct convolution layer
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1,
+                                            "Grouping (num_groups != 1) with ClDirectConv2d is not supported");
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                ClDirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info));
+            break;
+        }
+        case ConvolutionMethod::INDIRECT:
+        {
+            // Validate indirect convolution layer
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1,
+                                            "Grouping (num_groups != 1) with ClIndirectConv2d is not supported");
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                ClIndirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info));
+            break;
+        }
+        case ConvolutionMethod::GEMM:
+        {
+            // Validate gemm-based convolution layer
+            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmConv2d::validate(src, weights, biases, dst, conv2d_info, weights_info));
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Not supported.");
+            break;
+    }
+
+    return Status{};
+}
+
+ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src,
+                                                   const ITensorInfo *weights,
+                                                   const ITensorInfo *dst,
+                                                   const Conv2dInfo  &conv2d_info,
+                                                   const WeightsInfo &weights_info,
+                                                   const GPUTarget    gpu_target)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(dst);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(weights);
+    ARM_COMPUTE_UNUSED(weights_info);
+
+    const PadStrideInfo       conv_info        = conv2d_info.conv_info;
+    const ActivationLayerInfo act_info         = conv2d_info.act_info;
+    const Size2D              dilation         = conv2d_info.dilation;
+    bool                      enable_fast_math = conv2d_info.enable_fast_math;
+
+    const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
+    const size_t idx_c = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
+
+    /* Input spatial dims, kernel size, IFM/OFM, conv info*/
+    using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo, DataLayout>;
+    using ConfigurationMethod      = std::pair<ConvolutionConfiguration, ConvolutionMethod>;
+
+    const std::vector<ConfigurationMethod> known_configs = {
+        // Alexnet
+        ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U),
+                                                     PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW),
+                            ConvolutionMethod::DIRECT),
+        // VGG16 / VGG19
+        ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U),
+                                                     PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW),
+                            ConvolutionMethod::DIRECT),
+        // Mobilenet 224
+        ConfigurationMethod(ConvolutionConfiguration(
+                                Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U),
+                                PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW),
+                            ConvolutionMethod::GEMM),
+        // Mobilenet 160
+        ConfigurationMethod(ConvolutionConfiguration(
+                                Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U),
+                                PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW),
+                            ConvolutionMethod::GEMM),
+        // Mobilenet 224
+        ConfigurationMethod(ConvolutionConfiguration(
+                                Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U),
+                                PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC),
+                            ConvolutionMethod::GEMM),
+        // Mobilenet 160
+        ConfigurationMethod(ConvolutionConfiguration(
+                                Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U),
+                                PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC),
+                            ConvolutionMethod::GEMM),
+    };
+
+    const auto find_config = [&](ConfigurationMethod c)
+    {
+        const ConvolutionConfiguration config      = c.first;
+        const PadStrideInfo            info        = std::get<3>(config);
+        const DataLayout               data_layout = std::get<4>(config);
+
+        return std::get<0>(config) == Size2D(src->dimension(idx_w), src->dimension(idx_h)) &&
+               std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) &&
+               std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) &&
+               info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() &&
+               info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() &&
+               info.stride() == conv_info.stride() && (data_layout == src->data_layout());
+    };
+
+    std::vector<ConfigurationMethod>::const_iterator found;
+    if ((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
+    {
+        return (*found).second;
+    }
+
+    if (dilation != Size2D(1U, 1U))
+    {
+        return ConvolutionMethod::GEMM;
+    }
+    else
+    {
+        if (src->data_layout() == DataLayout::NCHW)
+        {
+            // SRGAN
+            if ((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) &&
+                (conv_info.pad_top() < 3) &&
+                (ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)))
+            {
+                return ConvolutionMethod::DIRECT;
+            }
+            if ((weights->dimension(idx_h) > 5) && (src->dimension(idx_c) > dst->dimension(idx_c)) &&
+                (CLFFTConvolutionLayer::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)))
+            {
+                return ConvolutionMethod::FFT;
+            }
+            if (src->dimension(idx_c) < 16)
+            {
+                return ConvolutionMethod::GEMM;
+            }
+            return bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math))
+                       ? ConvolutionMethod::WINOGRAD
+                       : ConvolutionMethod::GEMM;
+        }
+        else
+        {
+            const bool is_direct_valid =
+                bool(ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info));
+            const bool is_wino_valid =
+                bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math));
+            const size_t kernel_sz_direct_conv_thr = get_direct_conv_kernel_threshold_nhwc(gpu_target);
+
+            // SRGAN case
+            if ((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) &&
+                (conv_info.pad_top() < 3) && is_direct_valid)
+            {
+                return ConvolutionMethod::DIRECT;
+            }
+
+            // Floating-point case: GeMM/Direct/Winograd
+            if (is_data_type_float(src->data_type()))
+            {
+                // Get dst shape
+                TensorShape output_shape =
+                    misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
+                const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) &&
+                                                (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr);
+                const bool is_ifm_ge_8       = src->dimension(idx_c) >= 8;
+                const bool is_ifm_ge_16      = src->dimension(idx_c) >= 16;
+                const bool is_ofm_lte_8      = weights->dimension(3U) <= 8;
+                const bool is_ofm_lt_64      = weights->dimension(3U) < 64;
+                const bool workload_gte_8192 = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192;
+                const bool is_ifm_gt_ofm     = src->dimension(idx_c) > weights->dimension(3U);
+                const bool is_m_one          = output_shape[1] * output_shape[2] == 1;
+                const bool is_unit_stride =
+                    (conv2d_info.conv_info.stride().first == 1) && (conv2d_info.conv_info.stride().second == 1);
+                const int32_t kernel_sz = weights->dimension(idx_w) * weights->dimension(idx_h);
+
+                // Run Winograd if valid and IFM >= 8
+                if (is_wino_valid && is_ifm_ge_8)
+                {
+                    if (is_ofm_lte_8)
+                    {
+                        if (gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 ||
+                            get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD)
+                        {
+                            return ConvolutionMethod::WINOGRAD;
+                        }
+                    }
+                    else
+                    {
+                        return ConvolutionMethod::WINOGRAD;
+                    }
+                }
+
+                // Direct convolution case
+                if (is_direct_valid)
+                {
+                    if ((gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 ||
+                         get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD))
+                    {
+                        if (is_large_kernel_sz && is_ifm_ge_16 && is_ifm_gt_ofm)
+                        {
+                            return ConvolutionMethod::DIRECT;
+                        }
+                    }
+                    else if (gpu_target == arm_compute::GPUTarget::G76)
+                    {
+                        if ((is_large_kernel_sz && workload_gte_8192 && is_ifm_ge_16) || (is_ofm_lte_8 && is_ifm_ge_16))
+                        {
+                            return ConvolutionMethod::DIRECT;
+                        }
+                    }
+                    else
+                    {
+                        ConvolutionMethod preferred_conv_method = ConvolutionMethod::DIRECT;
+
+                        const bool is_indirect_valid =
+                            bool(ClIndirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info));
+
+                        // indirect conv2d should be called when:
+                        // 1- When the kernel size is greater than 1x1 and less than or equal to 9x9 (81)
+                        // 2- When the kernel size is odd
+                        // 3- When the Gpu target is Arm Mali-G77
+                        if (is_indirect_valid)
+                        {
+                            const bool is_kernel_sz_odd = kernel_sz % 2;
+                            const bool is_g77           = gpu_target == GPUTarget::G77;
+                            preferred_conv_method = (kernel_sz > 1) && (kernel_sz <= 81) && is_kernel_sz_odd && is_g77
+                                                        ? ConvolutionMethod::INDIRECT
+                                                        : ConvolutionMethod::DIRECT;
+                        }
+
+                        // Direct/indirect convolution used for the first layer of the network
+                        if (workload_gte_8192 && !is_ifm_ge_16 && !is_unit_stride && is_ofm_lt_64)
+                        {
+                            // In general, the question we should ask for the first convolution layer of a model is:
+                            // when the execution time of im2col + gemm < direct?. Since im2col does not depend on the OFM, it means that
+                            // when OFM is big enough, the contribution of im2col is small and the GEMM approach is preferable.
+                            // From internal experiments, the OFM threshold is 64 (is_ofm_lt_64)
+                            return preferred_conv_method;
+                        }
+
+                        if ((is_large_kernel_sz || is_m_one) && workload_gte_8192 && is_ifm_ge_16)
+                        {
+                            return preferred_conv_method;
+                        }
+
+                        // Direct convolution used for the last layer of the network
+                        if (is_ofm_lte_8)
+                        {
+                            return preferred_conv_method;
+                        }
+                    }
+                }
+
+                // Default case
+                return ConvolutionMethod::GEMM;
+            }
+
+            // Generic case for quantized. Only GeMM
+            return ConvolutionMethod::GEMM;
+        }
+    }
+}
+
+void ClConv2d::run(ITensorPack &tensors)
+{
+    prepare(tensors);
+    _operator->run(tensors);
+}
+
+void ClConv2d::prepare(ITensorPack &tensors)
+{
+    _operator->prepare(tensors);
+}
+
+experimental::MemoryRequirements ClConv2d::workspace() const
+{
+    return _aux_mem;
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClConv2d.h b/src/gpu/cl/operators/ClConv2d.h
new file mode 100644
index 0000000000..0cf3cbc1ce
--- /dev/null
+++ b/src/gpu/cl/operators/ClConv2d.h
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLCONV2D_H
+#define ARM_COMPUTE_CLCONV2D_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to compute the convolution layer. This function calls the following OpenCL kernels/functions:
+ *
+ * -# @ref opencl::ClGemmConv2d
+ * -# @ref opencl::ClWinogradConv2d
+ * -# @ref opencl::ClIndirectConv2d
+ * -# @ref opencl::ClDirectConv2d
+ * -# @ref CLFFTConvolutionLayer
+ *
+ * The function selects one of the algorithms mentioned above based on:
+ *      - The size of the kernel
+ *      - Number of src/dst feature maps
+ *      - Amount of memory needed
+ *
+ * Generally GEMM-based convolution is executed when neither Winograd nor FFT nor Direct convolution can be performed.
+ *
+ * FP32 Algorithm| Filter Size                                                 |   Input/Output feature maps               |
+ * --------------|-------------------------------------------------------------|-------------------------------------------|
+ * Winograd      | 3x3 1x3 3x1 5x1 1x5 5x5(fast maths) 7x1 1x7                 |  Input channels is greater than 3         |
+ * FFT           | Squared kernels and greater than 9x9                        |  Input feature maps > Output feature maps |
+ * DirectConv    | 9x9                                                         |                                           |
+ * GEMM          | Any size                                                    |                                           |
+ *
+ * Winograd 5x5 requires fast maths enabled.
+ *
+ * FP16 Algorithm| Filter Size                |   Input/Output feature maps               |
+ * --------------|----------------------------|-------------------------------------------|
+ * Winograd      | 3x3 1x3 3x1 5x1 1x5 5x5    |  Input channels is greater than 3         |
+ * FFT           | Not supported              |                                           |
+ * DirectConv    | 9x9                        |                                           |
+ * GEMM          | Any size                   |                                           |
+ *
+ * Winograd FP16 requires fast maths enabled.
+ *
+ */
+class ClConv2d : public IClOperator
+{
+public:
+    /** Default constructor */
+    ClConv2d();
+    /** Default Destructor */
+    ~ClConv2d();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    ClConv2d(const ClConv2d &) = delete;
+    /** Default move constructor */
+    ClConv2d(ClConv2d &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    ClConv2d &operator=(const ClConv2d &) = delete;
+    /** Default move assignment operator */
+    ClConv2d &operator=(ClConv2d &&) = default;
+    /** Set the src and dst tensors.
+     *
+     * Valid data layouts:
+     * - NHWC
+     * - NCHW
+     *
+     * Valid data type configurations:
+     * |src0           |src1               |src2   |dst            |
+     * |:--------------|:------------------|:------|:--------------|
+     * |F16            |F16                |F16    |F16            |
+     * |F32            |F32                |F32    |F32            |
+     * |QASYMM8        |QASYMM8            |S32    |QASYMM8        |
+     * |QASYMM8        |QSYMM8_PER_CHANNEL |S32    |QASYMM8        |
+     * |QASYMM8_SIGNED |QASYMM8_SIGNED     |S32    |QASYMM8_SIGNED |
+     * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32    |QASYMM8_SIGNED |
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. 3 lower dimensions represent a single src [width, height, IFM],
+     *                             while every optional dimension from 4 and above represent a batch of srcs.
+     *                             Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  weights         Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+     *                             Data type supported: Same as @p src, also could be QSYMM8_PER_CHANNEL if src is QASYMM8/QASYMM8_SIGNED.
+     * @param[in]  biases          Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+     *                             Data type supported: Same as @p src, except for src of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+     * @param[out] dst             Destination tensor info. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent batch of dsts.
+     *                             Data types supported: Same as @p src.
+     * @param[in]  conv2d_info     Contains convolution 2d info described in @ref Conv2dInfo.
+     * @param[in]  weights_info    Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p src.
+     */
+    void configure(const CLCompileContext &compile_context,
+                   ITensorInfo            *src,
+                   ITensorInfo            *weights,
+                   ITensorInfo            *biases,
+                   ITensorInfo            *dst,
+                   const Conv2dInfo       &conv2d_info,
+                   const WeightsInfo      &weights_info = WeightsInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref ClConv2d
+     *
+     * Similar to ClConv2d::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src,
+                           const ITensorInfo *weights,
+                           const ITensorInfo *biases,
+                           const ITensorInfo *dst,
+                           const Conv2dInfo  &conv2d_info,
+                           const WeightsInfo &weights_info = WeightsInfo());
+    /** Static function to check if given info will return the convolution called by @ref ClConv2d
+     *
+     * @param[in] src          Source tensor. 3 lower dimensions represent a single src [width, height, IFM],
+     *                         while every optional dimension from 4 and above represent a batch of srcs.
+     *                         Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] weights      Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+     *                         Data type supported: Same as @p src, also could be QSYMM8_PER_CHANNEL if src is QASYMM8/QASYMM8_SIGNED.
+     * @param[in] dst          Destination tensor. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent batch of dsts.
+     *                         Data types supported: Same as @p src.
+     * @param[in] conv2d_info  Contains convolution 2d info described in @ref Conv2dInfo.
+     * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel.
+     * @param[in] gpu_target   Specifies the @p GPUTarget.
+     *
+     * @return the Convolution Method Hint
+     */
+    static ConvolutionMethod get_convolution_method(const ITensorInfo *src,
+                                                    const ITensorInfo *weights,
+                                                    const ITensorInfo *dst,
+                                                    const Conv2dInfo  &conv2d_info,
+                                                    const WeightsInfo &weights_info,
+                                                    const GPUTarget    gpu_target);
+    // Inherited methods overridden:
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &tensors) override;
+    experimental::MemoryRequirements workspace() const override;
+
+private:
+    std::unique_ptr<IClOperator>     _operator;
+    experimental::MemoryRequirements _aux_mem{};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLCONV2D_H */
diff --git a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp
new file mode 100644
index 0000000000..cf24c68d21
--- /dev/null
+++ b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_context,
+                                               const ITensorInfo      *src,
+                                               ITensorInfo            *dst,
+                                               const TensorShape      &original_src_shape,
+                                               DataLayout              data_layout)
+{
+    ARM_COMPUTE_LOG_PARAMS(src, dst, original_src_shape, data_layout);
+    auto k = std::make_unique<kernels::ClConvertFullyConnectedWeightsKernel>();
+    k->configure(compile_context, src, dst, original_src_shape, data_layout);
+    _kernel = std::move(k);
+}
+
+Status ClConvertFullyConnectedWeights::validate(const ITensorInfo *src,
+                                                const ITensorInfo *dst,
+                                                const TensorShape &original_src_shape,
+                                                DataLayout         data_layout)
+{
+    return kernels::ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h
new file mode 100644
index 0000000000..c46152081c
--- /dev/null
+++ b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_H
+#define ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClConvertFullyConnectedWeightsKernel */
+class ClConvertFullyConnectedWeights : public IClOperator
+{
+public:
+    /** Initialise the kernel's inputs and outputs
+     *
+     * @param[in] compile_context    The compile context to be used.
+     * @param[in] src                The src tensor info. Data types supported: All.
+     * @param[in] dst                The dst tensor info. Data types supported: Same as @p src
+     * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer).
+     * @param[in] data_layout        The data layout the weights have been trained in.
+     */
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo      *src,
+                   ITensorInfo            *dst,
+                   const TensorShape      &original_src_shape,
+                   DataLayout              data_layout);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClConvertFullyConnectedWeights::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src,
+                           const ITensorInfo *dst,
+                           const TensorShape &original_src_shape,
+                           DataLayout         data_layout);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_H */
diff --git a/src/gpu/cl/operators/ClCopy.cpp b/src/gpu/cl/operators/ClCopy.cpp
new file mode 100644
index 0000000000..e2be7cebd4
--- /dev/null
+++ b/src/gpu/cl/operators/ClCopy.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClCopy.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClCopyKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClCopy::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, Window *dst_window)
+{
+    ARM_COMPUTE_LOG_PARAMS(src, dst, dst_window);
+    auto k = std::make_unique<kernels::ClCopyKernel>();
+    k->configure(compile_context, src, dst, dst_window);
+    _kernel = std::move(k);
+}
+
+Status ClCopy::validate(const ITensorInfo *src, const ITensorInfo *dst, Window *dst_window)
+{
+    return kernels::ClCopyKernel::validate(src, dst, dst_window);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClCopy.h b/src/gpu/cl/operators/ClCopy.h
new file mode 100644
index 0000000000..fe9b58c607
--- /dev/null
+++ b/src/gpu/cl/operators/ClCopy.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_COPY_H
+#define ARM_COMPUTE_CL_COPY_H
+
+#include "arm_compute/core/Window.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClCopyKernel */
+class ClCopy : public IClOperator
+{
+public:
+    /** Initialise the function's source and destination.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data types supported: All.
+     * @param[out] dst             Output tensor info. Data types supported: Same as @p src.
+     * @param[in]  dst_window      (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr.
+     *
+     */
+    void configure(const CLCompileContext &compile_context,
+                   ITensorInfo            *src,
+                   ITensorInfo            *dst,
+                   Window                 *dst_window = nullptr);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClCopy::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Window *dst_window = nullptr);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_COPY_H */
diff --git a/src/gpu/cl/operators/ClCrop.cpp b/src/gpu/cl/operators/ClCrop.cpp
new file mode 100644
index 0000000000..6313e4fbb5
--- /dev/null
+++ b/src/gpu/cl/operators/ClCrop.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClCrop.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClCropKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClCrop::configure(const ClCompileContext &compile_context,
+                       const ITensorInfo      *src,
+                       ITensorInfo            *dst,
+                       Coordinates2D           start,
+                       Coordinates2D           end,
+                       uint32_t                batch_index,
+                       float                   extrapolation_value,
+                       Window                 *dst_window)
+{
+    ARM_COMPUTE_LOG_PARAMS(src, dst, start, end, batch_index, extrapolation_value, dst_window);
+    auto k = std::make_unique<kernels::ClCropKernel>();
+    k->configure(compile_context, src, dst, start, end, batch_index, extrapolation_value, dst_window);
+    _kernel = std::move(k);
+}
+
+Status ClCrop::validate(const ITensorInfo *src,
+                        const ITensorInfo *dst,
+                        Coordinates2D      start,
+                        Coordinates2D      end,
+                        uint32_t           batch_index,
+                        float              extrapolation_value,
+                        Window            *dst_window)
+{
+    return kernels::ClCropKernel::validate(src, dst, start, end, batch_index, extrapolation_value, dst_window);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClCrop.h b/src/gpu/cl/operators/ClCrop.h
new file mode 100644
index 0000000000..e845cf372c
--- /dev/null
+++ b/src/gpu/cl/operators/ClCrop.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_CROP_H
+#define ARM_COMPUTE_CL_CROP_H
+
+#include "arm_compute/core/Window.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClCropKernel */
+class ClCrop : public IClOperator
+{
+public:
+    /** Initialise the function's source and destination.
+     *
+     * @note Supported tensor rank: up to 4
+     *
+     * @param[in]  compile_context     The compile context to be used.
+     * @param[in]  src                 Source tensor info. Data type supported: All. Data layouts supported: NHWC.
+     * @param[out] dst                 Destination tensor info. Data type supported: F32
+     * @param[in]  start               Coordinates of where to start cropping the image.
+     * @param[in]  end                 Coordinates of where to end cropping the image.
+     * @param[in]  batch_index         Fourth dimension index of the 3D image to crop in @p src.
+     * @param[in]  extrapolation_value Value to be used for values outside of the image. Default is 0.
+     * @param[in]  dst_window          Output window to be used in case cropped image is being copied into a tensor. Default is nullptr.
+     */
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo      *src,
+                   ITensorInfo            *dst,
+                   Coordinates2D           start,
+                   Coordinates2D           end,
+                   uint32_t                batch_index,
+                   float                   extrapolation_value = 0,
+                   Window                 *dst_window          = nullptr);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClCrop::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src,
+                           const ITensorInfo *dst,
+                           Coordinates2D      start,
+                           Coordinates2D      end,
+                           uint32_t           batch_index,
+                           float              extrapolation_value = 0,
+                           Window            *dst_window          = nullptr);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_CROP_H */
diff --git a/src/gpu/cl/operators/ClDequantize.cpp b/src/gpu/cl/operators/ClDequantize.cpp
new file mode 100644
index 0000000000..eb6f9e7abb
--- /dev/null
+++ b/src/gpu/cl/operators/ClDequantize.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClDequantize.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClDequantizeKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClDequantize::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst)
+{
+    ARM_COMPUTE_LOG_PARAMS(src, dst);
+    auto k = std::make_unique<kernels::ClDequantizeKernel>();
+    k->configure(compile_context, src, dst);
+    _kernel = std::move(k);
+}
+
+Status ClDequantize::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    return kernels::ClDequantizeKernel::validate(src, dst);
+}
+
+void ClDequantize::run(ITensorPack &tensors)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+    CLScheduler::get().enqueue_op(*_kernel.get(), tensors);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClDequantize.h b/src/gpu/cl/operators/ClDequantize.h
new file mode 100644
index 0000000000..ccaac2cd49
--- /dev/null
+++ b/src/gpu/cl/operators/ClDequantize.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_DEQUANTIZE_H
+#define ARM_COMPUTE_CL_DEQUANTIZE_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClDequantizeKernel that dequantizes an input tensor */
+class ClDequantize : public IClOperator
+{
+public:
+    /** Set the input and output tensors.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
+     * @param[out] dst             Destination tensor info with the same dimensions of @p src. Data type supported: F16/F32.
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClDequantize::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+    // Inherited method overridden
+    void run(ITensorPack &tensors) override;
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_DEQUANTIZE_H */
diff --git a/src/gpu/cl/operators/ClDirectConv2d.cpp b/src/gpu/cl/operators/ClDirectConv2d.cpp
new file mode 100644
index 0000000000..17a196ce6b
--- /dev/null
+++ b/src/gpu/cl/operators/ClDirectConv2d.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClDirectConv2d.h"
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/gpu/cl/kernels/ClActivationKernel.h"
+#include "src/gpu/cl/kernels/ClDirectConv2dKernel.h"
+#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h"
+#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h"
+#include "src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h"
+#include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h"
+
+using namespace arm_compute::cl_direct_conv;
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace
+{
+ITensorPack select_activation_src_dst(ITensorPack &tensors)
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, tensors.get_tensor(TensorType::ACL_DST));
+    pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(TensorType::ACL_DST));
+    return pack;
+}
+
+DirectConvComputeKernelInfo
+config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info)
+{
+    // Get GPU target
+    GPUTarget gpu_target = CLScheduler::get().target();
+
+    std::unique_ptr<IClDirectConvKernelConfig> t = ClDirectConvKernelConfigurationFactory::create(gpu_target);
+
+    return t->configure(src, weights, conv_info);
+}
+
+} // namespace
+
+void ClDirectConv2d::configure(const CLCompileContext    &compile_context,
+                               ITensorInfo               *src,
+                               ITensorInfo               *weights,
+                               ITensorInfo               *biases,
+                               ITensorInfo               *dst,
+                               const PadStrideInfo       &conv_info,
+                               const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+    ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info);
+
+    // Initialize the direct convolution descriptor
+    const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, weights, conv_info);
+
+    // Configure direct convolution kernel
+    const ActivationLayerInfo conv2d_act_info =
+        (src->data_layout() == DataLayout::NHWC && is_data_type_float(src->data_type())) ? act_info
+                                                                                         : ActivationLayerInfo();
+    auto k = std::make_unique<kernels::ClDirectConv2dKernel>();
+    k->set_target(CLScheduler::get().target());
+    k->configure(compile_context, src, weights, biases, dst, conv_info, conv2d_act_info, desc);
+    _direct_conv_kernel = std::move(k);
+
+    // Configure border handler
+    PixelValue zero_value(0.f);
+    if (is_data_type_quantized_asymmetric(src->data_type()))
+    {
+        zero_value = PixelValue(0, src->data_type(), src->quantization_info());
+    }
+    auto b = std::make_unique<CLFillBorderKernel>();
+    b->configure(compile_context, src, _direct_conv_kernel->border_size(), BorderMode::CONSTANT, zero_value);
+    _src_border_handler = std::move(b);
+
+    // Fused activation is currently supported for NHWC and floating point types
+    if (act_info.enabled() && !conv2d_act_info.enabled())
+    {
+        auto a = std::make_unique<kernels::ClActivationKernel>();
+        a->configure(compile_context, dst, dst, act_info);
+        _activation_kernel = std::move(a);
+    }
+
+    // Tune kernels
+    CLScheduler::get().tune_kernel_static(*_direct_conv_kernel);
+}
+
+Status ClDirectConv2d::validate(const ITensorInfo         *src,
+                                const ITensorInfo         *weights,
+                                const ITensorInfo         *biases,
+                                const ITensorInfo         *dst,
+                                const PadStrideInfo       &conv_info,
+                                const ActivationLayerInfo &act_info)
+{
+    // Initialize the direct convolution descriptor
+    const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, weights, conv_info);
+
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, ActivationLayerInfo(), desc));
+    if (act_info.enabled())
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, dst, act_info));
+    }
+    return Status{};
+}
+
+void ClDirectConv2d::run(ITensorPack &tensors)
+{
+    // Run border handler
+    CLScheduler::get().enqueue_op(*_src_border_handler.get(), tensors, false);
+    // Run direct convolution
+    CLScheduler::get().enqueue_op(*_direct_conv_kernel.get(), tensors, false);
+    // Run activation kernel
+    if (_activation_kernel)
+    {
+        auto act_pack = select_activation_src_dst(tensors);
+        CLScheduler::get().enqueue_op(*_activation_kernel.get(), act_pack, false);
+    }
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClDirectConv2d.h b/src/gpu/cl/operators/ClDirectConv2d.h
new file mode 100644
index 0000000000..0f18490814
--- /dev/null
+++ b/src/gpu/cl/operators/ClDirectConv2d.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_DIRECT_CONV2D_H
+#define ARM_COMPUTE_CL_DIRECT_CONV2D_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/IClOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to simulate a directly convolution layer. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if padding size is different from zero)
+ * -# @ref opencl::ClDirectConv2d
+ */
+class ClDirectConv2d : public IClOperator
+{
+public:
+    ClDirectConv2d() = default;
+    /** Set the src and dst tensors.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor. 3 lower dimensions represent a single src [width, height, IFM],
+     *                             while every optional dimension from 4 and above represent a batch of srcs.
+     *                             Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+     * @param[in]  weights         Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p src.
+     * @param[in]  biases          Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+     *                             Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type.
+     * @param[out] dst             Destination tensor. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent batch of dsts.
+     *                             Data types supported: Same as @p src.
+     * @param[in]  conv_info       Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
+     *
+     */
+    void configure(const CLCompileContext    &compile_context,
+                   ITensorInfo               *src,
+                   ITensorInfo               *weights,
+                   ITensorInfo               *biases,
+                   ITensorInfo               *dst,
+                   const PadStrideInfo       &conv_info,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClDirectConv2d::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo         *src,
+                           const ITensorInfo         *weights,
+                           const ITensorInfo         *biases,
+                           const ITensorInfo         *dst,
+                           const PadStrideInfo       &conv_info,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited method overridden
+    void run(ITensorPack &tensors) override;
+
+private:
+    std::unique_ptr<IClKernel> _direct_conv_kernel{nullptr};
+    std::unique_ptr<IClKernel> _src_border_handler{nullptr};
+    std::unique_ptr<IClKernel> _activation_kernel{nullptr};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_DIRECT_CONV2D_H */
diff --git a/src/gpu/cl/operators/ClDirectConv3d.cpp b/src/gpu/cl/operators/ClDirectConv3d.cpp
new file mode 100644
index 0000000000..b08347936b
--- /dev/null
+++ b/src/gpu/cl/operators/ClDirectConv3d.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClDirectConv3d.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/gpu/cl/kernels/ClDirectConv3dKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClDirectConv3d::configure(const CLCompileContext &compile_context,
+                               const ITensorInfo      *src0,
+                               const ITensorInfo      *src1,
+                               const ITensorInfo      *src2,
+                               ITensorInfo            *dst,
+                               const Conv3dInfo       &conv3d_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src0);
+
+    // Configure direct convolution 3d kernel
+    auto k = std::make_unique<kernels::ClDirectConv3dKernel>();
+    k->configure(compile_context, src0, src1, src2, dst, conv3d_info);
+    _direct_conv3d_kernel = std::move(k);
+}
+
+Status ClDirectConv3d::validate(const ITensorInfo *src0,
+                                const ITensorInfo *src1,
+                                const ITensorInfo *src2,
+                                const ITensorInfo *dst,
+                                const Conv3dInfo  &conv3d_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv3dKernel::validate(src0, src1, src2, dst, conv3d_info));
+    return Status{};
+}
+
+void ClDirectConv3d::run(ITensorPack &tensors)
+{
+    // Run direct convolution 3d
+    CLScheduler::get().enqueue_op(*_direct_conv3d_kernel.get(), tensors, true);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClDirectConv3d.h b/src/gpu/cl/operators/ClDirectConv3d.h
new file mode 100644
index 0000000000..5fb32460e2
--- /dev/null
+++ b/src/gpu/cl/operators/ClDirectConv3d.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_DIRECT_CONV3D_H
+#define ARM_COMPUTE_CL_DIRECT_CONV3D_H
+
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/IClOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class CLCompileContext;
+struct Conv3dInfo;
+class IClKernel;
+
+namespace opencl
+{
+/** Basic function to simulate a directly convolution layer with 3 spatial dimensions. This function calls the following OpenCL kernels:
+ *
+ * -# @ref opencl::ClDirectConv3d
+ */
+class ClDirectConv3d : public IClOperator
+{
+public:
+    ClDirectConv3d() = default;
+    /** Set the src and dst tensors.
+     *
+     * Valid data layouts:
+     * - NDHWC
+     *
+     * Valid data type configurations:
+     * |src0           |src1           |src2   |dst            |
+     * |:--------------|:--------------|:------|:--------------|
+     * |F16            |F16            |F16    |F16            |
+     * |F32            |F32            |F32    |F32            |
+     * |QASYMM8        |QASYMM8        |S32    |QASYMM8        |
+     * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32    |QASYMM8_SIGNED |
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src0            Source tensor. 4 lower dimensions represent a single src [IFM, width, height, depth],
+     *                             while every optional dimension from 5 and above represent a batch of srcs.
+     * @param[in]  src1            Weights tensor. Weights are 5D tensor with dimensions [OFM, IFM, kernel_w, kernel_h, kernel_d].
+     * @param[in]  src2            Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+     * @param[out] dst             Destination tensor. 4 lower dimensions represent a single dst [OFM, width, height, depth], while the rest represent batch of dsts.
+     * @param[in]  conv3d_info     Contains strides, padding, rounding, activation, dilation and fast math information. Activation and fast math are currently unused.
+     *
+     */
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo      *src0,
+                   const ITensorInfo      *src1,
+                   const ITensorInfo      *src2,
+                   ITensorInfo            *dst,
+                   const Conv3dInfo       &conv3d_info);
+
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClDirectConv3d::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src0,
+                           const ITensorInfo *src1,
+                           const ITensorInfo *src2,
+                           const ITensorInfo *dst,
+                           const Conv3dInfo  &conv3d_info);
+
+    // Inherited method overridden
+    void run(ITensorPack &tensors) override;
+
+private:
+    std::unique_ptr<IClKernel> _direct_conv3d_kernel{nullptr};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_DIRECT_CONV3D_H */
diff --git a/src/gpu/cl/operators/ClElementwiseOperations.cpp b/src/gpu/cl/operators/ClElementwiseOperations.cpp
new file mode 100644
index 0000000000..1325371d19
--- /dev/null
+++ b/src/gpu/cl/operators/ClElementwiseOperations.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClElementwiseOperations.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClElementwiseDivision::configure(const ClCompileContext    &compile_context,
+                                      ITensorInfo               *src1,
+                                      ITensorInfo               *src2,
+                                      ITensorInfo               *dst,
+                                      const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info);
+    auto k = std::make_unique<kernels::ClArithmeticKernel>();
+    k->configure(compile_context, ArithmeticOperation::DIV, src1, src2, dst, act_info);
+    _kernel = std::move(k);
+}
+
+Status ClElementwiseDivision::validate(const ITensorInfo         *src1,
+                                       const ITensorInfo         *src2,
+                                       const ITensorInfo         *dst,
+                                       const ActivationLayerInfo &act_info)
+{
+    return kernels::ClArithmeticKernel::validate(ArithmeticOperation::DIV, src1, src2, dst, act_info);
+}
+
+void ClElementwiseMax::configure(const ClCompileContext    &compile_context,
+                                 ITensorInfo               *src1,
+                                 ITensorInfo               *src2,
+                                 ITensorInfo               *dst,
+                                 const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info);
+    auto k = std::make_unique<kernels::ClArithmeticKernel>();
+    k->configure(compile_context, ArithmeticOperation::MAX, src1, src2, dst, act_info);
+    _kernel = std::move(k);
+}
+
+Status ClElementwiseMax::validate(const ITensorInfo         *src1,
+                                  const ITensorInfo         *src2,
+                                  const ITensorInfo         *dst,
+                                  const ActivationLayerInfo &act_info)
+{
+    return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MAX, src1, src2, dst, act_info);
+}
+
+void ClElementwiseMin::configure(const ClCompileContext    &compile_context,
+                                 ITensorInfo               *src1,
+                                 ITensorInfo               *src2,
+                                 ITensorInfo               *dst,
+                                 const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info);
+    auto k = std::make_unique<kernels::ClArithmeticKernel>();
+    k->configure(compile_context, ArithmeticOperation::MIN, src1, src2, dst, act_info);
+    _kernel = std::move(k);
+}
+
+Status ClElementwiseMin::validate(const ITensorInfo         *src1,
+                                  const ITensorInfo         *src2,
+                                  const ITensorInfo         *dst,
+                                  const ActivationLayerInfo &act_info)
+{
+    return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MIN, src1, src2, dst, act_info);
+}
+
+void ClElementwiseSquaredDiff::configure(const ClCompileContext    &compile_context,
+                                         ITensorInfo               *src1,
+                                         ITensorInfo               *src2,
+                                         ITensorInfo               *dst,
+                                         const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info);
+    auto k = std::make_unique<kernels::ClArithmeticKernel>();
+    k->configure(compile_context, ArithmeticOperation::SQUARED_DIFF, src1, src2, dst, act_info);
+    _kernel = std::move(k);
+}
+
+Status ClElementwiseSquaredDiff::validate(const ITensorInfo         *src1,
+                                          const ITensorInfo         *src2,
+                                          const ITensorInfo         *dst,
+                                          const ActivationLayerInfo &act_info)
+{
+    return kernels::ClArithmeticKernel::validate(ArithmeticOperation::SQUARED_DIFF, src1, src2, dst, act_info);
+}
+
+void ClElementwisePower::configure(const ClCompileContext    &compile_context,
+                                   ITensorInfo               *src1,
+                                   ITensorInfo               *src2,
+                                   ITensorInfo               *dst,
+                                   const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info);
+    auto k = std::make_unique<kernels::ClArithmeticKernel>();
+    k->configure(compile_context, ArithmeticOperation::POWER, src1, src2, dst, act_info);
+    _kernel = std::move(k);
+}
+
+Status ClElementwisePower::validate(const ITensorInfo         *src1,
+                                    const ITensorInfo         *src2,
+                                    const ITensorInfo         *dst,
+                                    const ActivationLayerInfo &act_info)
+{
+    return kernels::ClArithmeticKernel::validate(ArithmeticOperation::POWER, src1, src2, dst, act_info);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClElementwiseOperations.h b/src/gpu/cl/operators/ClElementwiseOperations.h
new file mode 100644
index 0000000000..de7c018d75
--- /dev/null
+++ b/src/gpu/cl/operators/ClElementwiseOperations.h
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H
+#define ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for division
+ *
+ * @note The tensor data type for the inputs must be F16/F32.
+ * @note The function performs an arithmetic division between two tensors.
+ */
+class ClElementwiseDivision : public IClOperator
+{
+public:
+    /** Configure function for a given list of arguments.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src1            First source tensor info. Data types supported: F16/F32.
+     * @param[in]  src2            Second source tensor info. same as @p src1.
+     * @param[out] dst             Destination tensor info. Data types supported: same as @p src1.
+     * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const ClCompileContext    &compile_context,
+                   ITensorInfo               *src1,
+                   ITensorInfo               *src2,
+                   ITensorInfo               *dst,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClElementwiseDivision::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo         *src1,
+                           const ITensorInfo         *src2,
+                           const ITensorInfo         *dst,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+
+/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for max
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
+ * @note The function performs a max operation between two tensors.
+ */
+class ClElementwiseMax : public IClOperator
+{
+public:
+    /** Configure function for a given list of arguments.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src1            First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
+     * @param[in]  src2            Second source tensor info. Data types supported: same as @p src1.
+     * @param[out] dst             Destination tensor info. Data types supported: same as @p src1.
+     * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const ClCompileContext    &compile_context,
+                   ITensorInfo               *src1,
+                   ITensorInfo               *src2,
+                   ITensorInfo               *dst,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClElementwiseMax::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo         *src1,
+                           const ITensorInfo         *src2,
+                           const ITensorInfo         *dst,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+
+/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for min
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
+ * @note The function performs a max operation between two tensors.
+ */
+class ClElementwiseMin : public IClOperator
+{
+public:
+    /** Configure function for a given list of arguments.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src1            First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
+     * @param[in]  src2            Second source tensor info. Data types supported: same as @p src1.
+     * @param[out] dst             Destination tensor info. Data types supported: same as @p src1.
+     * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const ClCompileContext    &compile_context,
+                   ITensorInfo               *src1,
+                   ITensorInfo               *src2,
+                   ITensorInfo               *dst,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClElementwiseMin::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo         *src1,
+                           const ITensorInfo         *src2,
+                           const ITensorInfo         *dst,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+
+/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for squared difference
+ *
+ * @note The tensor data type for the inputs must be QASYMM8/U8/S16/QSYMM16/F16/F32.
+ * @note The function performs a squared different operation between two tensors (i.e., out[i] = (in1[i] - in2[i])^2
+ */
+class ClElementwiseSquaredDiff : public IClOperator
+{
+public:
+    /** Configure function for a given list of arguments.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src1            First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     * @param[in]  src2            Second source tensor info. Data types supported: same as @p src1.
+     * @param[out] dst             Destination tensor info. Data types supported: same as @p src1.
+     * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const ClCompileContext    &compile_context,
+                   ITensorInfo               *src1,
+                   ITensorInfo               *src2,
+                   ITensorInfo               *dst,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClElementwiseSquaredDiff::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo         *src1,
+                           const ITensorInfo         *src2,
+                           const ITensorInfo         *dst,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+
+/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for power
+ *
+ * @note The tensor data type for the inputs must be F16/F32.
+ * @note The function performs an elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i])
+ */
+class ClElementwisePower : public IClOperator
+{
+public:
+    /** Configure function for a given list of arguments.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src1            First source tensor info. Data types supported: F16/F32.
+     * @param[in]  src2            Second source tensor info. Data types supported: F16/F32.
+     * @param[out] dst             Destination tensor info. Data types supported:F16/F32.
+     * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const ClCompileContext    &compile_context,
+                   ITensorInfo               *src1,
+                   ITensorInfo               *src2,
+                   ITensorInfo               *dst,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClElementwisePower::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo         *src1,
+                           const ITensorInfo         *src2,
+                           const ITensorInfo         *dst,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H */
diff --git a/src/gpu/cl/operators/ClElementwiseUnary.cpp b/src/gpu/cl/operators/ClElementwiseUnary.cpp
new file mode 100644
index 0000000000..914621183e
--- /dev/null
+++ b/src/gpu/cl/operators/ClElementwiseUnary.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClElementwiseUnary.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClRsqrt::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+    ARM_COMPUTE_LOG_PARAMS(src, dst);
+    auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
+    k->configure(compile_context, src, dst, ElementWiseUnary::RSQRT);
+    _kernel = std::move(k);
+}
+
+Status ClRsqrt::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::RSQRT);
+}
+
+void ClExp::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+    ARM_COMPUTE_LOG_PARAMS(src, dst);
+    auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
+    k->configure(compile_context, src, dst, ElementWiseUnary::EXP);
+    _kernel = std::move(k);
+}
+
+Status ClExp::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::EXP);
+}
+
+void ClNeg::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+    ARM_COMPUTE_LOG_PARAMS(src, dst);
+    auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
+    k->configure(compile_context, src, dst, ElementWiseUnary::NEG);
+    _kernel = std::move(k);
+}
+
+Status ClNeg::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::NEG);
+}
+
+void ClSin::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+    ARM_COMPUTE_LOG_PARAMS(src, dst);
+    auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
+    k->configure(compile_context, src, dst, ElementWiseUnary::SIN);
+    _kernel = std::move(k);
+}
+
+Status ClSin::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::SIN);
+}
+
+void ClAbs::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+    ARM_COMPUTE_LOG_PARAMS(src, dst);
+    auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
+    k->configure(compile_context, src, dst, ElementWiseUnary::ABS);
+    _kernel = std::move(k);
+}
+
+Status ClAbs::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::ABS);
+}
+
+void ClLog::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+    ARM_COMPUTE_LOG_PARAMS(src, dst);
+    auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
+    k->configure(compile_context, src, dst, ElementWiseUnary::LOG);
+    _kernel = std::move(k);
+}
+
+Status ClLog::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::LOG);
+}
+
+void ClRound::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+    ARM_COMPUTE_LOG_PARAMS(src, dst);
+    auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
+    k->configure(compile_context, src, dst, ElementWiseUnary::ROUND);
+    _kernel = std::move(k);
+}
+
+Status ClRound::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::ROUND);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClElementwiseUnary.h b/src/gpu/cl/operators/ClElementwiseUnary.h
new file mode 100644
index 0000000000..a23b789ab5
--- /dev/null
+++ b/src/gpu/cl/operators/ClElementwiseUnary.h
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_ELEMENTWISE_UNARY_H
+#define ARM_COMPUTE_CL_ELEMENTWISE_UNARY_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to perform inverse square root on an src tensor. */
+class ClRsqrt : public IClOperator
+{
+public:
+    /** Initialize the function
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data types supported: F16/F32.
+     * @param[out] dst             Destination tensor info. Data types supported: same as @p src.
+     */
+    void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClRsqrt::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+
+/** Basic function to perform exponential on an src tensor. */
+class ClExp : public IClOperator
+{
+public:
+    /** Initialize the function
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data types supported: F16/F32.
+     * @param[out] dst             Destination tensor info. Data types supported: same as @p src.
+     */
+    void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClExp::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+
+/** Basic function to negate an src tensor. */
+class ClNeg : public IClOperator
+{
+public:
+    /** Initialize the function
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data types supported: F16/F32.
+     * @param[out] dst             Destination tensor info. Data types supported: same as @p src.
+     */
+    void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClNeg::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+
+/** Basic function to calculate sine of an src tensor. */
+class ClSin : public IClOperator
+{
+public:
+    /** Initialize the function
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data types supported: F16/F32.
+     * @param[out] dst             Destination tensor info. Data types supported: same as @p src.
+     */
+    void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClSin::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+
+/** Basic function to perform elementwise log on an src tensor. */
+class ClLog : public IClOperator
+{
+public:
+    /** Initialize the function
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data types supported: F16/F32.
+     * @param[out] dst             Destination tensor info. Data types supported: same as @p src.
+     */
+    void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClLog::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+
+/** Basic function to get the absolute value of an src tensor. */
+class ClAbs : public IClOperator
+{
+public:
+    /** Initialize the function
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data types supported: F16/F32.
+     * @param[out] dst             Destination tensor info. Data types supported: same as @p src.
+     */
+    void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClAbs::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+
+/** Basic function to get the round (to the nearest even) value of an src tensor. */
+class ClRound : public IClOperator
+{
+public:
+    /** Initialize the function
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data types supported: F16/F32.
+     * @param[out] dst             Destination tensor info. Data types supported: same as @p src.
+     */
+    void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClRound::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_ELEMENTWISE_UNARY_H */
diff --git a/src/gpu/cl/operators/ClFill.cpp b/src/gpu/cl/operators/ClFill.cpp
new file mode 100644
index 0000000000..817b15ab20
--- /dev/null
+++ b/src/gpu/cl/operators/ClFill.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClFill.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClFillKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClFill::configure(const ClCompileContext &compile_context,
+                       ITensorInfo            *tensor,
+                       const PixelValue       &constant_value,
+                       Window                 *dst_window)
+{
+    ARM_COMPUTE_LOG_PARAMS(tensor, constant_value, dst_window);
+    auto k = std::make_unique<kernels::ClFillKernel>();
+    k->configure(compile_context, tensor, constant_value, dst_window);
+    _kernel = std::move(k);
+}
+
+Status ClFill::validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *dst_window)
+{
+    return kernels::ClFillKernel::validate(tensor, constant_value, dst_window);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClFill.h b/src/gpu/cl/operators/ClFill.h
new file mode 100644
index 0000000000..e13862aa6b
--- /dev/null
+++ b/src/gpu/cl/operators/ClFill.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_FILL_H
+#define ARM_COMPUTE_CL_FILL_H
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClFillKernel */
+class ClFill : public IClOperator
+{
+public:
+    /** Initialise the kernel's tensor and filling value
+     *
+     * @param[in]     compile_context The compile context to be used.
+     * @param[in,out] tensor          Source tensor info. Supported data types: All.
+     * @param[in]     constant_value  The value used to fill the planes of the tensor
+     * @param[in]     window          Window to be used in case setting only part of a tensor. Default is nullptr.
+     */
+    void configure(const CLCompileContext &compile_context,
+                   ITensorInfo            *tensor,
+                   const PixelValue       &constant_value,
+                   Window                 *window = nullptr);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClFill::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_FILL_H */
diff --git a/src/gpu/cl/operators/ClFlatten.cpp b/src/gpu/cl/operators/ClFlatten.cpp
new file mode 100644
index 0000000000..7532532c94
--- /dev/null
+++ b/src/gpu/cl/operators/ClFlatten.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClFlatten.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClReshapeKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClFlatten::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+    ARM_COMPUTE_LOG_PARAMS(src, dst);
+    auto k = std::make_unique<kernels::ClReshapeKernel>();
+    k->configure(compile_context, src, dst);
+    _kernel = std::move(k);
+}
+
+Status ClFlatten::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    return kernels::ClReshapeKernel::validate(src, dst);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClFlatten.h b/src/gpu/cl/operators/ClFlatten.h
new file mode 100644
index 0000000000..d2ce3b701d
--- /dev/null
+++ b/src/gpu/cl/operators/ClFlatten.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_FLATTEN_H
+#define ARM_COMPUTE_CL_FLATTEN_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to flatten a given input */
+class ClFlatten : public IClOperator
+{
+public:
+    /** Configure operator for a given list of arguments
+     *
+     * Valid data layouts:
+     * - All
+     *
+     * Valid data type configurations:
+     * |src            |dst            |
+     * |:--------------|:--------------|
+     * |All            |All            |
+     *
+     * @param[in] compile_context The compile context to be used.
+     * @param[in] src             Source tensor to flatten with at least 3 dimensions.
+     *                            The dimensions above the third will be interpreted as batches. Data types supported: All
+     * @param[in] dst             Destination tensor with shape [w*h*d, input_batches] where:
+     *                            w = width input tensor, h = height input tensor and d = depth input tensor.
+     *                            Data type supported: same as @p src
+     */
+    void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClFlatten::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_FLATTEN_H */
diff --git a/src/gpu/cl/operators/ClFloor.cpp b/src/gpu/cl/operators/ClFloor.cpp
new file mode 100644
index 0000000000..6790160172
--- /dev/null
+++ b/src/gpu/cl/operators/ClFloor.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClFloor.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClFloorKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClFloor::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+    ARM_COMPUTE_LOG_PARAMS(src, dst);
+    auto k = std::make_unique<kernels::ClFloorKernel>();
+    k->configure(compile_context, src, dst);
+    _kernel = std::move(k);
+}
+
+Status ClFloor::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    return kernels::ClFloorKernel::validate(src, dst);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClFloor.h b/src/gpu/cl/operators/ClFloor.h
new file mode 100644
index 0000000000..746147335e
--- /dev/null
+++ b/src/gpu/cl/operators/ClFloor.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_FLOOR_H
+#define ARM_COMPUTE_CL_FLOOR_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClFloorKernel */
+class ClFloor : public IClOperator
+{
+public:
+    /** Configure operator for a given list of arguments
+     *
+     * @param[in] compile_context The compile context to be used.
+     * @param[in] src             Source tensor info. Data types supported: F16/F32.
+     * @param[in] dst             Destination tensor info. Data type supported: same as @p src
+     */
+    void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClFloor::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_FLOOR_H */
diff --git a/src/gpu/cl/operators/ClFullyConnected.cpp b/src/gpu/cl/operators/ClFullyConnected.cpp
new file mode 100644
index 0000000000..6969ac8ab3
--- /dev/null
+++ b/src/gpu/cl/operators/ClFullyConnected.cpp
@@ -0,0 +1,698 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClFullyConnected.h"
+
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h"
+#include "src/gpu/cl/operators/ClFlatten.h"
+#include "src/gpu/cl/operators/ClGemm.h"
+#include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"
+#include "src/gpu/cl/operators/ClMatMul.h"
+#include "src/gpu/cl/operators/ClTranspose.h"
+#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h"
+#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h"
+#include "support/Cast.h"
+
+#include <algorithm>
+
+namespace arm_compute
+{
+namespace opencl
+{
+using namespace arm_compute::experimental;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+// Function to calculate batched tensor shape in format [M, 1, B0, B1 ..] which is the format matmul expects
+inline TensorShape get_reshaped_matmul_tensor(const TensorShape &src)
+{
+    return TensorShape(src.x(), 1, src.y(), src.collapsed_from(2).z()); // Return value optimisation
+}
+
+Status construct_gemmlowp_output_stage(const ITensorInfo       &src,
+                                       const ITensorInfo       &weights,
+                                       const ITensorInfo       &dst,
+                                       GEMMLowpOutputStageInfo &gemmlowp_output_stage,
+                                       ActivationLayerInfo      activation_info)
+{
+    gemmlowp_output_stage.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+    gemmlowp_output_stage.gemmlowp_offset     = 0;
+    gemmlowp_output_stage.gemmlowp_multiplier = 0;
+    gemmlowp_output_stage.gemmlowp_shift      = 0;
+
+    const auto data_type = src.data_type();
+
+    // Configure output stage for quantized case
+    if (is_data_type_quantized_asymmetric(data_type))
+    {
+        const QuantizationInfo        oq_info = dst.quantization_info();
+        const UniformQuantizationInfo iq_unif = src.quantization_info().uniform();
+        const UniformQuantizationInfo wq_unif = weights.quantization_info().uniform();
+        const UniformQuantizationInfo oq_unif = oq_info.uniform();
+
+        const auto output_quant_info = (dst.total_size() == 0) ? iq_unif : oq_unif;
+
+        const float multiplier        = (iq_unif.scale * wq_unif.scale) / output_quant_info.scale;
+        int         output_multiplier = 0;
+        int         output_shift      = 0;
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+
+        PixelValue type_min{};
+        PixelValue type_max{};
+        std::tie(type_min, type_max) = get_min_max(data_type);
+
+        if (activation_info.enabled())
+        {
+            std::tie(type_min, type_max) =
+                get_quantized_activation_min_max(activation_info, data_type, output_quant_info);
+        }
+
+        // Set the GEMMLowp output stage info
+        gemmlowp_output_stage.gemmlowp_offset     = output_quant_info.offset;
+        gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier;
+        gemmlowp_output_stage.gemmlowp_shift      = output_shift;
+        gemmlowp_output_stage.gemmlowp_multipliers.push_back(output_multiplier);
+        gemmlowp_output_stage.gemmlowp_shifts.push_back(output_shift);
+        type_min.get(gemmlowp_output_stage.gemmlowp_min_bound);
+        type_max.get(gemmlowp_output_stage.gemmlowp_max_bound);
+    }
+
+    return Status{};
+}
+
+Status validate_mm(const ITensorInfo             &src,
+                   const ITensorInfo             &weights,
+                   const ITensorInfo             *bias,
+                   const ITensorInfo             &dst,
+                   const FullyConnectedLayerInfo &fc_info,
+                   bool                           use_matmul)
+{
+    // Note : If input is dynamic and data is not batched, use matmul, else use gemm
+    const bool transpose_weights = fc_info.transpose_weights ? !fc_info.are_weights_reshaped : false;
+    const bool use_dynamic_gemm =
+        !use_matmul && !weights.are_values_constant() && transpose_weights; // use dynamic gemm as fallback for matmul
+    const bool is_quantized = is_data_type_quantized_asymmetric(src.data_type());
+
+    if (use_matmul)
+    {
+        const MatMulInfo m_info = MatMulInfo().adj_rhs(transpose_weights);
+
+        // Note: LHS is reshaped here to match ClMatMul expectations of batch index - From [M, B0, B1] to [M, 1, B0, B1]
+        TensorInfo lhs_to_use = src.clone()->set_tensor_shape(get_reshaped_matmul_tensor(src.tensor_shape()));
+
+        const GPUTarget                                         gpu_target = CLScheduler::get().target();
+        std::unique_ptr<cl_matmul::IClMatMulNativeKernelConfig> t =
+            cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target);
+        const MatMulKernelInfo kernel_info = t->configure(&lhs_to_use, &weights, m_info);
+
+        return is_quantized ? kernels::ClMatMulLowpNativeKernel::validate(&lhs_to_use, &weights, bias, &dst,
+                                                                          kernel_info, fc_info.activation_info)
+                            : kernels::ClMatMulNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, kernel_info,
+                                                                      fc_info.activation_info);
+    }
+    else
+    {
+        GEMMLowpOutputStageInfo gemmlowp_output_stage;
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            construct_gemmlowp_output_stage(src, weights, dst, gemmlowp_output_stage, fc_info.activation_info));
+
+        const GEMMInfo &gemm_info = GEMMInfo(false,                           // is_a_reshaped
+                                             false,                           // is_b_reshaped
+                                             !use_dynamic_gemm,               // reshape_b_only_on_first_run
+                                             0,                               // depth_output_gemm3d
+                                             false,                           // reinterpret_input_as_3d
+                                             fc_info.retain_internal_weights, // retain_internal_weights
+                                             gemmlowp_output_stage,           // gemmlowp_output_stage
+                                             fc_info.fp_mixed_precision,      // fp_mixed_precision
+                                             false,                           // fast_math
+                                             true,                            // broadcast_bias
+                                             ActivationLayerInfo());          // activation_info
+
+        if (is_quantized)
+        {
+            const UniformQuantizationInfo iq_info = src.quantization_info().uniform();
+            const UniformQuantizationInfo wq_info = weights.quantization_info().uniform();
+
+            // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+            // Extract and negate src and weights offset
+            const QuantizationInfo src_quantization_info(iq_info.scale, -iq_info.offset);
+            const QuantizationInfo weights_quantization_info(wq_info.scale, -wq_info.offset);
+
+            // Validate gemmlowp function
+            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyCore::validate(
+                &src.clone()->set_quantization_info(src_quantization_info),
+                &weights.clone()->set_quantization_info(weights_quantization_info), bias, &dst, gemm_info));
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(ClGemm::validate(&src, &weights, bias, &dst, 1.f, 1.f, gemm_info));
+        }
+    }
+
+    return Status{};
+}
+} // namespace
+
+ClFullyConnected::ClFullyConnected()
+    : _convert_weights(nullptr),
+      _flatten(nullptr),
+      _reshape_weights(nullptr),
+      _mm_gemm(nullptr),
+      _mm_gemmlowp(nullptr),
+      _matmul_native_kernel(nullptr),
+      _matmul_lowp_native_kernel(nullptr),
+      _aux_mem(Count)
+{
+}
+
+ClFullyConnected::~ClFullyConnected() = default;
+
+void ClFullyConnected::configure_mm(const CLCompileContext        &compile_context,
+                                    ITensorInfo                   *src,
+                                    ITensorInfo                   *weights,
+                                    ITensorInfo                   *bias,
+                                    ITensorInfo                   *dst,
+                                    const FullyConnectedLayerInfo &fc_info)
+{
+    // If weights are dynamic and matmul is supported use matmul, else use gemm
+    if (_use_matmul)
+    {
+        // Specify whether transpose weights is necessary in matmul info
+        const MatMulInfo mat_info = MatMulInfo().adj_rhs(_transpose_weights);
+
+        // Note: MatMul does not need offset negation unlike gemm
+        // 1. Change shape when calling matmul to fit batch expectations.
+        _lhs_to_use = src->clone()->set_tensor_shape(get_reshaped_matmul_tensor(_lhs_to_use.tensor_shape()));
+
+        // 2. Use heuristics to get kernel info object
+        const GPUTarget                                         gpu_target = CLScheduler::get().target();
+        std::unique_ptr<cl_matmul::IClMatMulNativeKernelConfig> kernel_config =
+            cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target);
+        MatMulKernelInfo kernel_info = kernel_config->configure(src, weights, mat_info);
+
+        // 3. Configure relevant matmul kernel
+        if (_is_quantized)
+        {
+            _matmul_lowp_native_kernel = std::make_unique<kernels::ClMatMulLowpNativeKernel>();
+            _matmul_lowp_native_kernel->set_target(gpu_target);
+            _matmul_lowp_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info,
+                                                  fc_info.activation_info);
+        }
+        else
+        {
+            _matmul_native_kernel = std::make_unique<kernels::ClMatMulNativeKernel>();
+            _matmul_native_kernel->set_target(gpu_target);
+            _matmul_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info,
+                                             fc_info.activation_info);
+        }
+    }
+    else
+    {
+        // Configure GEMM
+        GEMMLowpOutputStageInfo gemmlowp_output_stage;
+        construct_gemmlowp_output_stage(*src, *weights, *dst, gemmlowp_output_stage, fc_info.activation_info);
+
+        const GEMMInfo &gemm_info = GEMMInfo(false,                           // is_a_reshaped
+                                             false,                           // is_b_reshaped
+                                             !_dynamic_gemm,                  // reshape_b_only_on_first_run
+                                             0,                               // depth_output_gemm3d
+                                             false,                           // reinterpret_input_as_3d
+                                             fc_info.retain_internal_weights, // retain_internal_weights
+                                             gemmlowp_output_stage,           // gemmlowp_output_stage
+                                             fc_info.fp_mixed_precision,      // fp_mixed_precision
+                                             false,                           // fast_math
+                                             true,                            // broadcast_bias
+                                             fc_info.activation_info);        // activation_info
+
+        if (_is_quantized)
+        {
+            // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+            // Extract and negate input and weights offset
+            const QuantizationInfo src_quantization_info     = src->quantization_info();
+            const QuantizationInfo weights_quantization_info = weights->quantization_info();
+
+            TensorInfo src_info     = src->clone()->set_quantization_info(src_quantization_info);
+            TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info);
+
+            src_info.set_quantization_info(
+                QuantizationInfo(src_quantization_info.uniform().scale, -src_quantization_info.uniform().offset));
+            weights_info.set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale,
+                                                                -weights_quantization_info.uniform().offset));
+
+            // Configure gemmlowp function
+            _mm_gemmlowp = std::make_unique<ClGemmLowpMatrixMultiplyCore>();
+            _mm_gemmlowp->configure(compile_context, &src_info, &weights_info, bias, dst, gemm_info);
+        }
+        else
+        {
+            // Configure matrix multiply kernel
+            _mm_gemm = std::make_unique<ClGemm>();
+            _mm_gemm->configure(compile_context, src, weights, bias, dst, 1.f, 1.f, gemm_info);
+        }
+    }
+}
+
+void ClFullyConnected::configure_conv_fc(const CLCompileContext        &compile_context,
+                                         ITensorInfo                   *src,
+                                         ITensorInfo                   *weights,
+                                         ITensorInfo                   *bias,
+                                         ITensorInfo                   *dst,
+                                         const FullyConnectedLayerInfo &fc_info)
+{
+    // MatMul fuses transpose operation, so we use the first dimension for comparison where appropriate.
+    ARM_COMPUTE_ERROR_ON((weights->dimension((_use_matmul && _transpose_weights) ? 0 : 1) !=
+                          (src->dimension(0) * src->dimension(1) * src->dimension(2))));
+
+    // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
+
+    // Initialize output tensor for flatten
+    _flattened_src = src->clone()
+                         ->set_is_resizable(true)
+                         .reset_padding()
+                         .set_tensor_shape(compute_flatten_shape(src))
+                         .set_data_layout(DataLayout::NCHW);
+
+    // Configure flatten kernel
+    _flatten = std::make_unique<ClFlatten>();
+    _flatten->configure(compile_context, src, &_flattened_src);
+
+    // Note: if flatten has > 1 dimensions after, these dimensions are batch
+    // Configure matrix multiply kernel
+    configure_mm(compile_context, &_flattened_src, weights, bias, dst, fc_info);
+}
+
+void ClFullyConnected::configure_fc_fc(const CLCompileContext        &compile_context,
+                                       ITensorInfo                   *src,
+                                       ITensorInfo                   *weights,
+                                       ITensorInfo                   *bias,
+                                       ITensorInfo                   *dst,
+                                       const FullyConnectedLayerInfo &fc_info)
+{
+    // MatMul fuses transpose operation, so we use the first dimension for comparison where appropriate.
+    ARM_COMPUTE_ERROR_ON(src->dimension(0) != weights->dimension((_use_matmul && _transpose_weights) ? 0 : 1));
+
+    // Configure matrix multiply kernel
+    configure_mm(compile_context, src, weights, bias, dst, fc_info);
+}
+
+void ClFullyConnected::configure(const CLCompileContext &compile_context,
+                                 ITensorInfo            *src,
+                                 ITensorInfo            *weights,
+                                 ITensorInfo            *biases,
+                                 ITensorInfo            *dst,
+                                 FullyConnectedLayerInfo fc_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+    const GPUTarget gpu_target = get_arch_from_target(CLScheduler::get().target());
+
+    // Perform validate step
+    ARM_COMPUTE_ERROR_THROW_ON(ClFullyConnected::validate(src, weights, biases, dst, fc_info));
+    ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, fc_info);
+
+    _transpose_weights  = fc_info.transpose_weights ? !fc_info.are_weights_reshaped : false;
+    _is_fc_after_conv   = true;
+    _is_quantized       = is_data_type_quantized_asymmetric(src->data_type());
+    _is_prepared        = fc_info.retain_internal_weights;
+    _weights_to_use     = TensorInfo(*weights);
+    _weights_to_use_idx = ACL_SRC_1;
+
+    // When using dynamic weights - use matmul kernels.
+    // Note: MatMul is not used in the following cases (Gemm is used as fallback) :
+    // 1. When the weights tensor is not dynamic
+    // 2. MatMul does not support broadcasting batch dimension, and therefore is disabled if fc is batched.
+    // 3. When FC is after convolution and src tensor data layout does not match weights trained data layout (weights conversion kernel is required)
+    const bool is_batched_fc_layer = dst->dimension(1) > 1;
+    _use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && !is_batched_fc_layer &&
+                  !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout));
+    _dynamic_gemm = !weights->are_values_constant() && _transpose_weights && !_use_matmul;
+
+    // With the Fully Connected layer we can have 4 different cases:
+    //  1) Convolution layer -> Fully Connected layer without batches
+    //  2) Fully Connected layer -> Fully Connected layer without batches
+    //  3) Convolution layer -> Fully Connected layer with batches
+    //  4) Fully Connected layer -> Fully Connected layer with batches
+
+    // Check if we have a fully connected layer with batches
+    if (is_batched_fc_layer)
+    {
+        _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+                            (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(),
+                                        dst->tensor_shape().cbegin() + 1));
+    }
+    else
+    {
+        _is_fc_after_conv = src->num_dimensions() > 1;
+    }
+
+    ITensorInfo *weights_used = weights;
+
+    // Reshape weights if needed - Not needed when matmul is in use as matmul fuses transpose op.
+    if (_transpose_weights && !_use_matmul)
+    {
+        // Reshape the weights
+        _reshape_weights = std::make_unique<ClTranspose>();
+        _reshape_weights->configure(compile_context, weights, &_reshaped_weights);
+        weights_used        = &_reshaped_weights;
+        _weights_to_use_idx = offset_int_vec(TransposedWeights);
+    }
+
+    // Convert weights if needed
+    if (_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
+    {
+        // Convert weights
+        _convert_weights = std::make_unique<ClConvertFullyConnectedWeights>();
+        _convert_weights->configure(compile_context, weights_used, &_converted_weights, src->tensor_shape(),
+                                    fc_info.weights_trained_layout);
+
+        weights_used         = &_converted_weights;
+        _weights_to_use_idx  = offset_int_vec(ConvertedWeights);
+        _run_convert_weights = true;
+    }
+
+    if (_is_fc_after_conv)
+    {
+        // Fully Connected layer after a Convolution Layer without batches
+        configure_conv_fc(compile_context, src, weights_used, biases, dst, fc_info);
+    }
+    else
+    {
+        // Fully Connected layer after a Fully Connected Layer without batches
+        configure_fc_fc(compile_context, src, weights_used, biases, dst, fc_info);
+    }
+    // Update TensorInfo of final weights used (Need to be done in the end due to padding expansion)
+    _weights_to_use = *weights_used;
+
+    if (_use_matmul)
+    {
+        // Note : MatMul does not use transpose and does not need auxillary memory, so only converted weights are added to aux_mem
+        _aux_mem[ConvertedWeights] =
+            MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Temporary, _converted_weights.total_size());
+    }
+    else
+    {
+        // Set auxiliary memory requirements for gemm operators
+        auto gemm_mem_req = (_is_quantized) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace();
+        for (unsigned int i = 0; i < gemm_mem_req.size(); ++i)
+        {
+            _aux_mem[i] = gemm_mem_req[i];
+        }
+        if (_aux_mem[1].size > 0 || _aux_mem[2].size > 0) // Persistent weights memory on GEMMs
+        {
+            // Release permuted weights at the of prepare as they are further transposed by the assembly dispatch
+            // Keep all the auxiliary tensors in case of dynamic weights as they are recalculated every time
+            _aux_mem[TransposedWeights] = MemoryInfo(
+                offset_int_vec(TransposedWeights), _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare,
+                _reshaped_weights.total_size());
+            _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights),
+                                                    _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare,
+                                                    _converted_weights.total_size());
+        }
+        else
+        {
+            // Release permuted weights at the of prepare as they are further transposed by the assembly dispatch
+            const auto transposed_wei_lft = (_weights_to_use_idx == offset_int_vec(TransposedWeights))
+                                                ? MemoryLifetime::Persistent
+                                                : MemoryLifetime::Prepare;
+            const auto converted_wei_lft  = (_weights_to_use_idx == offset_int_vec(ConvertedWeights))
+                                                ? MemoryLifetime::Persistent
+                                                : MemoryLifetime::Prepare;
+
+            _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights),
+                                                     _dynamic_gemm ? MemoryLifetime::Temporary : transposed_wei_lft,
+                                                     _reshaped_weights.total_size());
+            _aux_mem[ConvertedWeights]  = MemoryInfo(offset_int_vec(ConvertedWeights),
+                                                    _dynamic_gemm ? MemoryLifetime::Temporary : converted_wei_lft,
+                                                     _converted_weights.total_size());
+        }
+    }
+    _aux_mem[FlattenedSrc] =
+        MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size());
+}
+
+Status ClFullyConnected::validate(const ITensorInfo      *src,
+                                  const ITensorInfo      *weights,
+                                  const ITensorInfo      *biases,
+                                  const ITensorInfo      *dst,
+                                  FullyConnectedLayerInfo fc_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) &&
+        fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU &&
+        fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU &&
+        fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
+    const GPUTarget gpu_target = get_arch_from_target(CLScheduler::get().target());
+
+    const bool transpose_weights = fc_info.transpose_weights ? !fc_info.are_weights_reshaped : false;
+    bool       is_fc_after_conv  = true;
+
+    // When using dynamic weights - use matmul kernels.
+    // Note: MatMul does not support broadcasting so fallback with batched cases.
+    const bool is_batched_fc_layer = dst->dimension(1) > 1;
+    const bool use_matmul          = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() &&
+                            !is_batched_fc_layer &&
+                            !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout));
+
+    const ITensorInfo &flatten_src      = TensorInfo(src->clone()
+                                                         ->set_is_resizable(true)
+                                                         .reset_padding()
+                                                         .set_tensor_shape(compute_flatten_shape(src))
+                                                         .set_data_layout(DataLayout::NCHW));
+    const ITensorInfo &reshaped_weights = TensorInfo(
+        weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
+    const ITensorInfo &converted_weights = (transpose_weights && !use_matmul)
+                                               ? TensorInfo(*reshaped_weights.clone())
+                                               : TensorInfo(weights->clone()->set_is_resizable(true).reset_padding());
+
+    // With the Fully Connected layer we can have 4 different cases:
+    //  1) Convolution layer -> Fully Connected layer without batches
+    //  2) Fully Connected layer -> Fully Connected layer without batches
+    //  3) Convolution layer -> Fully Connected layer with batches
+    //  4) Fully Connected layer -> Fully Connected layer with batches
+
+    const ITensorInfo *src_to_use     = src;
+    const ITensorInfo *weights_to_use = weights;
+
+    if (biases != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+        if (is_data_type_quantized(src->data_type()))
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
+        }
+    }
+
+    // Check if FC is after conv (flatten kernel is run in case where FC is after conv.)
+    if (is_batched_fc_layer)
+    {
+        is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+                           (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(),
+                                       dst->tensor_shape().cbegin() + 1));
+    }
+    else
+    {
+        is_fc_after_conv = src->num_dimensions() > 1;
+    }
+
+    // Transpose kernel does not run when matmul is supported as matmul fuses transpose op.
+    if (transpose_weights && !use_matmul)
+    {
+        // Validate reshape weights kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(ClTranspose::validate(weights, &reshaped_weights));
+        weights_to_use = &reshaped_weights;
+    }
+
+    if (is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
+    {
+        // Validate convert weights kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(ClConvertFullyConnectedWeights::validate(
+            weights_to_use, &converted_weights, src->tensor_shape(), fc_info.weights_trained_layout));
+        weights_to_use = &converted_weights;
+    }
+
+    if (is_fc_after_conv)
+    {
+        // Fully Connected layer after a Convolution Layer without batches
+        // K Index of matrix multiplication. MatMul performs transpose in kernel, so index is 0 when matmul and transpose enabled
+        const int weight_idx = (use_matmul && transpose_weights) ? 0 : 1;
+        ARM_COMPUTE_RETURN_ERROR_ON(
+            (weights_to_use->dimension(weight_idx) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
+
+        // Validate flatten kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(ClFlatten::validate(src, &flatten_src));
+        src_to_use = &flatten_src;
+    }
+    else
+    {
+        // Fully Connected layer after a Fully Connected Layer without batches
+        // K Index of matrix multiplication. MatMul performs transpose in kernel, so index is 0 when matmul and transpose enabled
+        const int weight_idx = (use_matmul && transpose_weights) ? 0 : 1;
+        ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != weights_to_use->dimension(weight_idx));
+    }
+
+    // Validate matrix multiply kernel
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*src_to_use, *weights_to_use, biases, *dst, fc_info, use_matmul));
+
+    return Status{};
+}
+
+void ClFullyConnected::run(ITensorPack &tensors)
+{
+    prepare(tensors);
+
+#ifdef ARM_COMPUTE_ASSERTS_ENABLED
+    ++_asrt_run_count;
+    ARM_COMPUTE_ERROR_ON(_dynamic_gemm && _asrt_prepare_count != _asrt_run_count);
+#endif // ARM_COMPUTE_ASSERTS_ENABLED
+
+    auto src = tensors.get_const_tensor(ACL_SRC_0);
+
+    CLAuxTensorHandler flattened_src(offset_int_vec(FlattenedSrc), _flattened_src, tensors, false);
+    CLAuxTensorHandler weights(_weights_to_use_idx, _weights_to_use, tensors, false);
+
+    // Linearize input if it comes from a convolutional layer
+    if (_is_fc_after_conv)
+    {
+        ITensorPack flatten_pack{{ACL_SRC, src}, {ACL_DST, flattened_src.get()}};
+        _flatten->run(flatten_pack);
+    }
+
+    ITensorPack gemm_pack = tensors;
+    gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? flattened_src.get() : src);
+    if (_weights_to_use_idx != ACL_SRC_1)
+    {
+        gemm_pack.add_const_tensor(ACL_SRC_1, weights.get());
+    }
+
+    // Run MatMul Op
+    if (_use_matmul)
+    {
+        // Run matmul kernels for matrix multiplication
+        if (_is_quantized)
+        {
+            CLScheduler::get().enqueue_op(*_matmul_lowp_native_kernel, gemm_pack, true);
+        }
+        else
+        {
+            CLScheduler::get().enqueue_op(*_matmul_native_kernel, gemm_pack, true);
+        }
+    }
+    else
+    {
+        // Run matrix multiply
+        if (_is_quantized)
+        {
+            _mm_gemmlowp->run(gemm_pack);
+        }
+        else
+        {
+            _mm_gemm->run(gemm_pack);
+        }
+    }
+}
+
+void ClFullyConnected::prepare(ITensorPack &tensors)
+{
+    // Note : Running prepare() each run when _use_matmul is true is unnecessary unless weights conversion is needed.
+    if (!_is_prepared || _dynamic_gemm)
+    {
+#ifdef ARM_COMPUTE_ASSERTS_ENABLED
+        ++_asrt_prepare_count;
+        ARM_COMPUTE_ERROR_ON(!_dynamic_gemm && !_use_matmul && _asrt_prepare_count > 1);
+#endif // ARM_COMPUTE_ASSERTS_ENABLED
+
+        auto weights = tensors.get_const_tensor(ACL_SRC_1);
+
+        CLAuxTensorHandler reshaped_weights(offset_int_vec(TransposedWeights), _reshaped_weights, tensors, false);
+        CLAuxTensorHandler converted_weights(offset_int_vec(ConvertedWeights), _converted_weights, tensors, false);
+
+        // Pointer to current weights
+        const ITensor *cur_weights = weights;
+
+        // Reshape weights if needed. Disabled when matmul kernels are enabled as matmul fuses transpose.
+        if (_transpose_weights && !_use_matmul)
+        {
+            // Run reshape weights kernel and mark weights as unused
+            ITensorPack transpose_pack{{ACL_SRC, weights}, {ACL_DST, reshaped_weights.get()}};
+            _reshape_weights->run(transpose_pack);
+
+            cur_weights->mark_as_unused();
+            cur_weights = reshaped_weights.get();
+        }
+
+        // Convert weights if needed
+        if (_run_convert_weights)
+        {
+            ITensorPack convert_pack{{ACL_SRC, cur_weights}, {ACL_DST, converted_weights.get()}};
+            _convert_weights->run(convert_pack);
+
+            cur_weights->mark_as_unused();
+            cur_weights = converted_weights.get();
+        }
+
+        ITensorPack gemm_pack = tensors;
+        gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights);
+
+        // Prepare GEMM prepare and release unused weights
+        if (_dynamic_gemm || !_use_matmul)
+        {
+            if (!_is_quantized)
+            {
+                _mm_gemm->prepare(gemm_pack);
+            }
+            else
+            {
+                _mm_gemmlowp->prepare(gemm_pack);
+            }
+        }
+
+        _is_prepared = true;
+    }
+}
+
+experimental::MemoryRequirements ClFullyConnected::workspace() const
+{
+    return _aux_mem;
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClFullyConnected.h b/src/gpu/cl/operators/ClFullyConnected.h
new file mode 100644
index 0000000000..72884ff7ad
--- /dev/null
+++ b/src/gpu/cl/operators/ClFullyConnected.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_OPERATORS_CLFULLYCONNECTED_H
+#define ACL_SRC_GPU_CL_OPERATORS_CLFULLYCONNECTED_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/function_info/FullyConnectedLayerInfo.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+#include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h"
+#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+// Forward declarations
+class ClConvertFullyConnectedWeights;
+class ClFlatten;
+class ClGemm;
+class ClGemmLowpMatrixMultiplyCore;
+class ClTranspose;
+// Kernel Forward Declarations
+namespace kernels
+{
+class ClMatMulNativeKernel;
+class ClMatMulLowpNativeKernel;
+} // namespace kernels
+/** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following OpenCL kernels:
+ *
+ *  -# @ref opencl::kernels::ClIm2ColKernel (called when the input comes from a convolutional layer)
+ *  -# @ref CLTranspose (if @p are_weights_reshaped is set to false and transpose_weights is set to true ) (called once)
+ *  -# @ref opencl::ClGemm or @ref CLGEMMLowpMatrixMultiplyCore (if quantized asymmetric)
+ *
+ * @note  The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ */
+class ClFullyConnected : public IClOperator
+{
+public:
+    ClFullyConnected();
+    ~ClFullyConnected();
+    /** Set the input and output tensors.
+     *
+     * Valid data layouts:
+     * - NHWC
+     * - NCHW
+     *
+     * Valid data type configurations:
+     * |src0           |src1               |src2   |dst            |
+     * |:--------------|:------------------|:------|:--------------|
+     * |F16            |F16                |F16    |F16            |
+     * |F32            |F32                |F32    |F32            |
+     * |QASYMM8        |QASYMM8            |S32    |QASYMM8        |
+     * |QASYMM8_SIGNED |QASYMM8_SIGNED     |S32    |QASYMM8_SIGNED |
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  weights         Weights tensor. The weights must be 2 dimensional.
+     *                             If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the first 3 input's dimensions.
+     *                             If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension.
+     *                             Data type supported: Same as @p src.
+     * @param[in]  biases          Bias tensor. Can be nullptr. Data type supported:Same as @p src.
+     * @param[out] dst             Destination tensor. Its shape should be equal to the output of a matrix multiplication between:
+     *                             - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer
+     *                             - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer.
+     *                             Data type supported: Same as @p src.
+     * @param[in]  fc_info         (Optional) Fully connected layer additional info
+     */
+    void configure(const CLCompileContext &compile_context,
+                   ITensorInfo            *src,
+                   ITensorInfo            *weights,
+                   ITensorInfo            *biases,
+                   ITensorInfo            *dst,
+                   FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClFullyConnected::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo      *src,
+                           const ITensorInfo      *weights,
+                           const ITensorInfo      *biases,
+                           const ITensorInfo      *dst,
+                           FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+
+    // Inherited methods overriden
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &tensors) override;
+    experimental::MemoryRequirements workspace() const override;
+
+private:
+    void configure_fc_fc(const CLCompileContext        &compile_context,
+                         ITensorInfo                   *src,
+                         ITensorInfo                   *weights,
+                         ITensorInfo                   *bias,
+                         ITensorInfo                   *dst,
+                         const FullyConnectedLayerInfo &fc_info);
+    void configure_conv_fc(const CLCompileContext        &compile_context,
+                           ITensorInfo                   *src,
+                           ITensorInfo                   *weights,
+                           ITensorInfo                   *bias,
+                           ITensorInfo                   *dst,
+                           const FullyConnectedLayerInfo &fc_info);
+    void configure_mm(const CLCompileContext        &compile_context,
+                      ITensorInfo                   *src,
+                      ITensorInfo                   *weights,
+                      ITensorInfo                   *bias,
+                      ITensorInfo                   *dst,
+                      const FullyConnectedLayerInfo &fc_info);
+
+private:
+    enum AuxTensorIdx
+    {
+        TransposedWeights = 10,
+        ConvertedWeights  = 11,
+        FlattenedSrc      = 12,
+        Count             = 13
+    };
+
+    std::unique_ptr<ClConvertFullyConnectedWeights> _convert_weights;
+    std::unique_ptr<ClFlatten>                      _flatten;
+    std::unique_ptr<ClTranspose>                    _reshape_weights;
+    std::unique_ptr<ClGemm>                         _mm_gemm;
+    std::unique_ptr<ClGemmLowpMatrixMultiplyCore>   _mm_gemmlowp;
+
+    std::unique_ptr<kernels::ClMatMulNativeKernel>     _matmul_native_kernel;
+    std::unique_ptr<kernels::ClMatMulLowpNativeKernel> _matmul_lowp_native_kernel;
+
+    experimental::MemoryRequirements _aux_mem{};
+
+    TensorInfo _flattened_src{};
+    TensorInfo _converted_weights{};
+    TensorInfo _reshaped_weights{};
+    TensorInfo _lhs_to_use{};
+    TensorInfo _weights_to_use{};
+    int        _weights_to_use_idx{ACL_SRC_1};
+
+    bool _run_convert_weights{false};
+    bool _transpose_weights{false};
+    bool _dynamic_gemm{false};
+    bool _use_matmul{false};
+
+    bool _is_fc_after_conv{true};
+    bool _is_quantized{false};
+    bool _is_prepared{false};
+
+#ifdef ARM_COMPUTE_ASSERTS_ENABLED
+    int _asrt_run_count{};
+    int _asrt_prepare_count{};
+#endif // ARM_COMPUTE_ASSERTS_ENABLED
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_OPERATORS_CLFULLYCONNECTED_H
diff --git a/src/gpu/cl/operators/ClGemm.cpp b/src/gpu/cl/operators/ClGemm.cpp
new file mode 100644
index 0000000000..815c254c69
--- /dev/null
+++ b/src/gpu/cl/operators/ClGemm.cpp
@@ -0,0 +1,923 @@
+/*
+ * Copyright (c) 2017-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClGemm.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Log.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/ITensorAllocator.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/core/utils/helpers/float_ops.h"
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
+#include "src/runtime/CL/gemm/CLGEMMKernelSelection.h"
+#include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h"
+#include "support/Cast.h"
+#include "utils/TypePrinter.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::cl_gemm;
+using namespace arm_compute::experimental;
+using namespace arm_compute::utils::cast;
+using namespace arm_compute::opencl::kernels;
+
+namespace
+{
+inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type)
+{
+    return kernel_type == CLGEMMKernelType::NATIVE ? false : true;
+}
+//Automatically select between mlgo (prioritized) and default heuristics for gemm kernel type
+inline CLGEMMKernelType
+auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run, bool constant_weights)
+{
+    if (!constant_weights)
+    {
+        return CLGEMMKernelType::NATIVE;
+    }
+
+    auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run);
+    if (bool(gemm_kernel))
+    {
+        if (validate_gemm_kernel(gemm_kernel.gemm_type))
+        {
+            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.",
+                                                      to_string(gemm_kernel.gemm_type).c_str());
+            return gemm_kernel.gemm_type;
+        }
+    }
+    gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run);
+    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.",
+                                              to_string(gemm_kernel.gemm_type).c_str());
+    return gemm_kernel.gemm_type;
+}
+// Validate lhs_info and rhs_info for reshaped only rhs kernel
+inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info,
+                                                    const GEMMRHSMatrixInfo &rhs_info,
+                                                    const ITensorInfo       *a,
+                                                    const ITensorInfo       *b,
+                                                    const ITensorInfo       *c,
+                                                    const ITensorInfo       *output,
+                                                    GEMMKernelInfo           gemm_kernel_info)
+{
+    // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel
+    TensorInfo tmp_b_info{};
+    // Validate reshape RHS kernel
+    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+    if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
+    {
+        return false;
+    }
+    // Validate mm kernel
+    gemm_kernel_info.lhs_info  = lhs_info;
+    gemm_kernel_info.rhs_info  = rhs_info;
+    gemm_kernel_info.has_pad_y = false;
+    if (!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info,
+                                                                  rhs_info, gemm_kernel_info)))
+    {
+        return false;
+    }
+    gemm_kernel_info.has_pad_y = true;
+    if (!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info,
+                                                                  rhs_info, gemm_kernel_info)))
+    {
+        return false;
+    }
+    return true;
+}
+
+//Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs
+inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query,
+                                          GEMMKernelInfo               kernel_info,
+                                          const ITensorInfo           *a,
+                                          const ITensorInfo           *b,
+                                          const ITensorInfo           *c,
+                                          const ITensorInfo           *output)
+{
+    auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query);
+    if (config)
+    {
+        if (validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info))
+        {
+            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+                "Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ",
+                to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+            return {config.lhs_info, config.rhs_info};
+        }
+    }
+    config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);
+    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+        "Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ",
+        to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+    return {config.lhs_info, config.rhs_info};
+}
+
+// Validate lhs_info and rhs_info for reshaped kernel
+inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info,
+                                           const GEMMRHSMatrixInfo &rhs_info,
+                                           const ITensorInfo       *a,
+                                           const ITensorInfo       *b,
+                                           const ITensorInfo       *c,
+                                           const ITensorInfo       *output,
+                                           GEMMKernelInfo           gemm_kernel_info,
+                                           bool                     reinterpret_input_as_3d)
+{
+    // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped kernel
+    TensorInfo tmp_a_info{};
+    TensorInfo tmp_b_info{};
+
+    // Validate reshape LHS kernel
+    auto_init_if_empty(tmp_a_info,
+                       a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d)));
+    if (!bool(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d)))
+    {
+        return false;
+    }
+
+    // Validate reshape RHS kernel
+    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+    if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
+    {
+        return false;
+    }
+    // Validate mm kernel
+    gemm_kernel_info.lhs_info = lhs_info;
+    gemm_kernel_info.rhs_info = rhs_info;
+    if (!bool(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, 1.f, 0.f, lhs_info,
+                                                           rhs_info, gemm_kernel_info)))
+    {
+        return false;
+    }
+    return true;
+}
+
+//Automatically select between mlgo (prioritized) and default heuristics for reshaped kernel configs
+inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery query,
+                                 GEMMKernelInfo               kernel_info,
+                                 const ITensorInfo           *a,
+                                 const ITensorInfo           *b,
+                                 const ITensorInfo           *c,
+                                 const ITensorInfo           *output,
+                                 bool                         reinterpret_input_as_3d)
+{
+    auto config = auto_heuristics::select_mlgo_gemm_config_reshaped(query);
+    if (config)
+    {
+        if (validate_lhs_rhs_info_reshaped(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info,
+                                           reinterpret_input_as_3d))
+        {
+            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+                "Use reshaped config from mlgo heuristics: LHS info: %s ; RHS info: %s ",
+                to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+            return {config.lhs_info, config.rhs_info};
+        }
+    }
+    config = auto_heuristics::select_default_gemm_config_reshaped(query);
+    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+        "Use reshaped config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(),
+        to_string(config.rhs_info).c_str());
+    return {config.lhs_info, config.rhs_info};
+}
+} // namespace
+
+ClGemm::ClGemm()
+    : _reshape_lhs_kernel(std::make_unique<ClGemmReshapeLhsMatrixKernel>()),
+      _reshape_rhs_kernel(std::make_unique<ClGemmReshapeRhsMatrixKernel>()),
+      _mm_native_kernel(std::make_unique<ClGemmMatrixMultiplyNativeKernel>()),
+      _mm_reshaped_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedKernel>()),
+      _mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedOnlyRhsKernel>()),
+      _mm_reshaped_only_rhs_mmul_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel>()),
+      _tmp_a(),
+      _tmp_b(),
+      _reshape_b_only_on_first_run(false),
+      _gemm_kernel_type(CLGEMMKernelType::NATIVE),
+      _is_prepared(false),
+      _aux_mem(AuxTensorIdx::Count)
+{
+}
+
+void ClGemm::configure_native(const CLCompileContext &compile_context,
+                              ITensorInfo            *a,
+                              ITensorInfo            *b,
+                              ITensorInfo            *c,
+                              ITensorInfo            *output,
+                              float                   alpha,
+                              float                   beta,
+                              const GEMMInfo         &gemm_info)
+{
+    DataType           data_type               = a->data_type();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m          = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n          = b->dimension(0);
+    const unsigned int k          = a->dimension(0);
+    const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+    const GPUTarget    gpu_target          = CLScheduler::get().target();
+    bool               broadcast_bias      = gemm_info.broadcast_bias();
+
+    GEMMKernelInfo kernel_info;
+    kernel_info.m                       = m;
+    kernel_info.n                       = n;
+    kernel_info.k                       = k;
+    kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
+    kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+    kernel_info.broadcast_bias          = broadcast_bias;
+    kernel_info.activation_info         = gemm_info.activation_info();
+
+    // Set the target for the kernels
+    _mm_native_kernel->set_target(gpu_target);
+
+    auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(
+        auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size});
+
+    // Configure and tune matrix multiply kernel
+    _mm_native_kernel->configure(compile_context, a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info,
+                                 kernel_info);
+}
+
+void ClGemm::configure_reshaped(const CLCompileContext &compile_context,
+                                ITensorInfo            *a,
+                                ITensorInfo            *b,
+                                ITensorInfo            *c,
+                                ITensorInfo            *output,
+                                float                   alpha,
+                                float                   beta,
+                                const GEMMInfo         &gemm_info)
+{
+    DataType           data_type               = a->data_type();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m          = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n          = b->dimension(0);
+    const unsigned int k          = a->dimension(0);
+    const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+    const GPUTarget    gpu_target          = CLScheduler::get().target();
+    bool               broadcast_bias      = gemm_info.broadcast_bias();
+
+    GEMMKernelInfo kernel_info;
+    kernel_info.m                       = m;
+    kernel_info.n                       = n;
+    kernel_info.k                       = k;
+    kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
+    kernel_info.reinterpret_input_as_3d = false;
+    kernel_info.broadcast_bias          = broadcast_bias;
+    kernel_info.activation_info         = gemm_info.activation_info();
+
+    // Set the target for the kernels
+    _reshape_lhs_kernel->set_target(gpu_target);
+    _mm_reshaped_kernel->set_target(gpu_target);
+
+    GEMMLHSMatrixInfo lhs_info{};
+    GEMMRHSMatrixInfo rhs_info{};
+
+    // Pick up the GEMM configuration
+    std::tie(lhs_info, rhs_info) =
+        auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size},
+                                         kernel_info, a, b, c, output, gemm_info.reinterpret_input_as_3d());
+
+    _reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
+    _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);
+
+    // Configure and tune matrix multiply kernel
+    _mm_reshaped_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info,
+                                   kernel_info);
+
+    // Request memory for LHS and RHS reshape matrix
+    _aux_mem[LhsReshape] = MemoryInfo(offset_int_vec(LhsReshape), MemoryLifetime::Temporary, _tmp_a.total_size());
+    _aux_mem[RhsReshape] = MemoryInfo(
+        offset_int_vec(RhsReshape),
+        _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
+}
+
+void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context,
+                                         ITensorInfo            *a,
+                                         ITensorInfo            *b,
+                                         ITensorInfo            *c,
+                                         ITensorInfo            *output,
+                                         float                   alpha,
+                                         float                   beta,
+                                         const GEMMInfo         &gemm_info)
+{
+    DataType           data_type               = a->data_type();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m          = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n          = b->dimension(0);
+    const unsigned int k          = a->dimension(0);
+    const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+    const GPUTarget    gpu_target          = CLScheduler::get().target();
+    bool               broadcast_bias      = gemm_info.broadcast_bias();
+
+    GEMMKernelInfo kernel_info;
+    kernel_info.m                       = m;
+    kernel_info.n                       = n;
+    kernel_info.k                       = k;
+    kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
+    kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+    kernel_info.broadcast_bias          = broadcast_bias;
+    kernel_info.activation_info         = gemm_info.activation_info();
+
+    // Set the target for the kernels
+    _mm_reshaped_only_rhs_kernel->set_target(gpu_target);
+
+    GEMMLHSMatrixInfo lhs_info{};
+    GEMMRHSMatrixInfo rhs_info{};
+
+    // Pick up the GEMM configuration
+    std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(
+        auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}, kernel_info, a, b, c, output);
+
+    // Transpose matrix
+    _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);
+
+    // Configure two variants of CLGEMMMatrixMultiplyReshapedOnlyRHSKernel (has_pad_y = false/true)
+    // During the prepare stage we check the padding requirement for the lhs and dst tensors. If they do not have
+    // pad y, we dispatch CLGEMMMatrixMultiplyReshapedOnlyRHSKernel with has_pad_y = false
+
+    // Configure matrix multiply kernel with no y padding support
+    kernel_info.has_pad_y = false;
+    _mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info,
+                                            kernel_info);
+
+    // Request memory for RHS reshape matrix
+    _aux_mem[RhsReshape] = MemoryInfo(
+        offset_int_vec(RhsReshape),
+        _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
+}
+
+void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context,
+                                              ITensorInfo            *a,
+                                              ITensorInfo            *b,
+                                              ITensorInfo            *c,
+                                              ITensorInfo            *output,
+                                              float                   alpha,
+                                              float                   beta,
+                                              const GEMMInfo         &gemm_info)
+{
+    DataType           data_type               = a->data_type();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m          = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n          = b->dimension(0);
+    const unsigned int k          = a->dimension(0);
+    const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+    const GPUTarget    gpu_target          = CLScheduler::get().target();
+    bool               broadcast_bias      = gemm_info.broadcast_bias();
+
+    GEMMKernelInfo kernel_info;
+    kernel_info.m                       = m;
+    kernel_info.n                       = n;
+    kernel_info.k                       = k;
+    kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
+    kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+    kernel_info.broadcast_bias          = broadcast_bias;
+    kernel_info.activation_info         = gemm_info.activation_info();
+
+    // Set the target for the kernels
+    _mm_reshaped_only_rhs_mmul_kernel->set_target(gpu_target);
+
+    GEMMLHSMatrixInfo lhs_info{};
+    GEMMRHSMatrixInfo rhs_info{};
+
+    // Pick up the GEMM configuration
+    auto gemm_config = select_default_gemm_config_reshaped_only_rhs(
+        auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size});
+    lhs_info = gemm_config.lhs_info;
+    rhs_info = gemm_config.rhs_info;
+    // Force H0 to 4 in order to use the MMUL extension
+    rhs_info.h0 = 4;
+
+    // Reshape Rhs matrix
+    _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);
+
+    // Configure matrix multiply kernel with no y padding support
+    kernel_info.has_pad_y = false;
+    _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info,
+                                                 rhs_info, kernel_info);
+
+    // Request memory for RHS reshape matrix
+    _aux_mem[RhsReshape] = MemoryInfo(
+        offset_int_vec(RhsReshape),
+        _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
+}
+
+Status ClGemm::validate_native(const ITensorInfo *a,
+                               const ITensorInfo *b,
+                               const ITensorInfo *c,
+                               const ITensorInfo *output,
+                               float              alpha,
+                               float              beta,
+                               const GEMMInfo    &gemm_info)
+{
+    ARM_COMPUTE_UNUSED(alpha);
+    ARM_COMPUTE_UNUSED(output);
+
+    // Get the GPU target
+    const GPUTarget    gpu_target              = CLScheduler::get().target();
+    DataType           data_type               = a->data_type();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m          = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n          = b->dimension(0);
+    const unsigned int k          = a->dimension(0);
+    const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+    const bool         broadcast_bias      = gemm_info.broadcast_bias();
+
+    GEMMKernelInfo kernel_info;
+    kernel_info.m                       = m;
+    kernel_info.n                       = n;
+    kernel_info.k                       = k;
+    kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
+    kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+    kernel_info.broadcast_bias          = broadcast_bias;
+    kernel_info.activation_info         = gemm_info.activation_info();
+
+    auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(
+        auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size});
+
+    // Validate matrix multiply
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyNativeKernel::validate(
+        a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, kernel_info));
+
+    return Status{};
+}
+
+Status ClGemm::validate_reshaped(const ITensorInfo *a,
+                                 const ITensorInfo *b,
+                                 const ITensorInfo *c,
+                                 const ITensorInfo *output,
+                                 float              alpha,
+                                 float              beta,
+                                 const GEMMInfo    &gemm_info)
+{
+    ARM_COMPUTE_UNUSED(alpha);
+    ARM_COMPUTE_UNUSED(output);
+
+    TensorInfo tmp_a_info{};
+    TensorInfo tmp_b_info{};
+
+    // Get the GPU target
+    const GPUTarget    gpu_target              = CLScheduler::get().target();
+    DataType           data_type               = a->data_type();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m          = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n          = b->dimension(0);
+    const unsigned int k          = a->dimension(0);
+    const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+    const bool         broadcast_bias      = gemm_info.broadcast_bias();
+
+    GEMMKernelInfo kernel_info;
+    kernel_info.m                       = m;
+    kernel_info.n                       = n;
+    kernel_info.k                       = k;
+    kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
+    kernel_info.reinterpret_input_as_3d = false;
+    kernel_info.broadcast_bias          = broadcast_bias;
+    kernel_info.activation_info         = gemm_info.activation_info();
+
+    GEMMLHSMatrixInfo lhs_info;
+    GEMMRHSMatrixInfo rhs_info;
+
+    // Pick up the GEMM configuration
+    // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
+    const auto gemm_config =
+        select_default_gemm_config_reshaped(auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size});
+    lhs_info = gemm_config.lhs_info;
+    rhs_info = gemm_config.rhs_info;
+
+    auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(
+                                       compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
+
+    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+
+    // Validate matrix multiply
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha,
+                                                                             beta, lhs_info, rhs_info, kernel_info));
+
+    return Status{};
+}
+
+Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a,
+                                          const ITensorInfo *b,
+                                          const ITensorInfo *c,
+                                          const ITensorInfo *output,
+                                          float              alpha,
+                                          float              beta,
+                                          const GEMMInfo    &gemm_info)
+{
+    ARM_COMPUTE_UNUSED(alpha);
+    ARM_COMPUTE_UNUSED(output);
+
+    TensorInfo tmp_b_info{};
+
+    // Get the GPU target
+    const GPUTarget    gpu_target              = CLScheduler::get().target();
+    const DataType     data_type               = a->data_type();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m          = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n          = b->dimension(0);
+    const unsigned int k          = a->dimension(0);
+    const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+    const bool         broadcast_bias      = gemm_info.broadcast_bias();
+
+    GEMMKernelInfo kernel_info;
+    kernel_info.m                       = m;
+    kernel_info.n                       = n;
+    kernel_info.k                       = k;
+    kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
+    kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+    kernel_info.broadcast_bias          = broadcast_bias;
+    kernel_info.activation_info         = gemm_info.activation_info();
+
+    GEMMLHSMatrixInfo lhs_info;
+    GEMMRHSMatrixInfo rhs_info;
+
+    // Pick up the GEMM configuration
+    // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
+    const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(
+        auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size});
+    lhs_info = gemm_config.lhs_info;
+    rhs_info = gemm_config.rhs_info;
+
+    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+
+    // Validate matrix multiply
+    kernel_info.has_pad_y = false;
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(
+        a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
+
+    kernel_info.has_pad_y = true;
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(
+        a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
+
+    return Status{};
+}
+
+Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a,
+                                               const ITensorInfo *b,
+                                               const ITensorInfo *c,
+                                               const ITensorInfo *output,
+                                               float              alpha,
+                                               float              beta,
+                                               const GEMMInfo    &gemm_info)
+{
+    ARM_COMPUTE_UNUSED(alpha);
+    ARM_COMPUTE_UNUSED(output);
+    TensorInfo tmp_b_info{};
+
+    // Get the GPU target
+    const GPUTarget    gpu_target              = CLScheduler::get().target();
+    const DataType     data_type               = a->data_type();
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m          = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n          = b->dimension(0);
+    const unsigned int k          = a->dimension(0);
+    const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+    const bool         broadcast_bias      = gemm_info.broadcast_bias();
+
+    GEMMKernelInfo kernel_info;
+    kernel_info.m                       = m;
+    kernel_info.n                       = n;
+    kernel_info.k                       = k;
+    kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
+    kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+    kernel_info.broadcast_bias          = broadcast_bias;
+    kernel_info.activation_info         = gemm_info.activation_info();
+
+    GEMMLHSMatrixInfo lhs_info;
+    GEMMRHSMatrixInfo rhs_info;
+
+    // Pick up the GEMM configuration
+    // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
+    const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(
+        auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size});
+    lhs_info = gemm_config.lhs_info;
+    rhs_info = gemm_config.rhs_info;
+    // Force H0 to 4 in order to use the MMUL extension
+    rhs_info.h0 = 4;
+
+    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+
+    // Validate matrix multiply
+    kernel_info.has_pad_y = false;
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(
+        a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
+
+    return Status{};
+}
+
+void ClGemm::configure(const CLCompileContext &compile_context,
+                       ITensorInfo            *a,
+                       ITensorInfo            *b,
+                       ITensorInfo            *c,
+                       ITensorInfo            *output,
+                       float                   alpha,
+                       float                   beta,
+                       const GEMMInfo         &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate(a, b, c, output, alpha, beta, gemm_info));
+    ARM_COMPUTE_LOG_PARAMS(a, b, c, output, alpha, beta, gemm_info);
+
+    // Check if we need to reshape the matrix B only on the first run
+    _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+    _is_prepared                 = gemm_info.retain_internal_weights();
+
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m          = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n          = b->dimension(0);
+    const unsigned int k          = a->dimension(0);
+    const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+
+    // Select GEMMType
+    _gemm_kernel_type = auto_select_gemm_kernel(
+        auto_heuristics::CommonQuery{CLScheduler::get().target(), a->data_type(), m, n, k, batch_size},
+        _reshape_b_only_on_first_run, b->are_values_constant());
+
+    const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);
+
+    ITensorInfo *c_to_use = fuse_add_c ? c : nullptr;
+
+    switch (_gemm_kernel_type)
+    {
+        case CLGEMMKernelType::NATIVE:
+        {
+            configure_native(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
+            break;
+        }
+        case CLGEMMKernelType::RESHAPED:
+        {
+            configure_reshaped(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
+            break;
+        }
+        case CLGEMMKernelType::RESHAPED_ONLY_RHS:
+        {
+            configure_reshaped_only_rhs(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
+            break;
+        }
+        case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL:
+        {
+            configure_reshaped_only_rhs_mmul(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("GEMMType not supported");
+        }
+    }
+}
+
+Status ClGemm::validate(const ITensorInfo *a,
+                        const ITensorInfo *b,
+                        const ITensorInfo *c,
+                        const ITensorInfo *output,
+                        float              alpha,
+                        float              beta,
+                        const GEMMInfo    &gemm_info)
+{
+    // Get the GPU target
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m          = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n          = b->dimension(0);
+    const unsigned int k          = a->dimension(0);
+    const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+
+    // Check data type early because the auto_select_gemm_kernel has assertions on supported data types
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16);
+
+    // Select GEMMType
+    CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel(
+        auto_heuristics::CommonQuery{
+            CLScheduler::get().target(),
+            a->data_type(),
+            m,
+            n,
+            k,
+            batch_size,
+        },
+        gemm_info.reshape_b_only_on_first_run(), b->are_values_constant());
+
+    const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);
+
+    const ITensorInfo *c_to_use = fuse_add_c ? c : nullptr;
+
+    switch (gemm_kernel_type)
+    {
+        case CLGEMMKernelType::NATIVE:
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(validate_native(a, b, c_to_use, output, alpha, beta, gemm_info));
+            break;
+        }
+        case CLGEMMKernelType::RESHAPED:
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped(a, b, c_to_use, output, alpha, beta, gemm_info));
+            break;
+        }
+        case CLGEMMKernelType::RESHAPED_ONLY_RHS:
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs(a, b, c_to_use, output, alpha, beta, gemm_info));
+            break;
+        }
+        case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL:
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                validate_reshaped_only_rhs_mmul(a, b, c_to_use, output, alpha, beta, gemm_info));
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_RETURN_ERROR_MSG("GEMMType not supported");
+        }
+    }
+
+    return Status{};
+}
+
+void ClGemm::run(ITensorPack &tensors)
+{
+    const ITensor *lhs = tensors.get_const_tensor(ACL_SRC_0);
+    const ITensor *rhs = tensors.get_const_tensor(ACL_SRC_1);
+    ITensor       *dst = tensors.get_tensor(ACL_DST);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, dst);
+
+    CLAuxTensorHandler lhs_reshaped(offset_int_vec(LhsReshape), _tmp_a, tensors, true);
+    CLAuxTensorHandler rhs_reshaped(offset_int_vec(RhsReshape), _tmp_b, tensors, true);
+
+    // Prepare the consts if needed
+    prepare(tensors);
+
+    // Run matrix multiply kernel
+    switch (_gemm_kernel_type)
+    {
+        case CLGEMMKernelType::NATIVE:
+        {
+            CLScheduler::get().enqueue_op(*_mm_native_kernel, tensors, true);
+            break;
+        }
+        case CLGEMMKernelType::RESHAPED:
+        {
+            // Run interleave kernel
+            ITensorPack reshape_lhs_pack{{ACL_SRC, lhs}, {ACL_DST, lhs_reshaped.get()}};
+            CLScheduler::get().enqueue_op(*_reshape_lhs_kernel, reshape_lhs_pack, false);
+
+            if (!_reshape_b_only_on_first_run)
+            {
+                // Run transpose kernel
+                ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}};
+                CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false);
+            }
+            // Copy original tensor pack and overwrite lhs and rhs with reshaped counterparts
+            ITensorPack gemm_reshaped_pack(tensors);
+            gemm_reshaped_pack.add_const_tensor(ACL_SRC_0, lhs_reshaped.get());
+            gemm_reshaped_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get());
+
+            if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED)
+            {
+                CLScheduler::get().enqueue_op(*_mm_reshaped_kernel, gemm_reshaped_pack, true);
+            }
+            break;
+        }
+        case CLGEMMKernelType::RESHAPED_ONLY_RHS:
+        {
+            if (!_reshape_b_only_on_first_run)
+            {
+                // Run transpose kernel
+                ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}};
+                CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false);
+            }
+            // In case of RESHAPED_ONLY_RHS, we need to check the padding requirement
+            // Check if the lhs or dst tensors have padding
+            const unsigned int cross_plane_pad_lhs = lhs->info()->padding().top + lhs->info()->padding().bottom;
+            const unsigned int cross_plane_pad_dst = dst->info()->padding().top + dst->info()->padding().bottom;
+            bool               has_pad_y           = (cross_plane_pad_lhs != 0) || (cross_plane_pad_dst != 0);
+
+            // Copy original tensor pack and overwrite rhs with reshaped counterpart
+            ITensorPack gemm_reshaped_onlyrhs_pack(tensors);
+            gemm_reshaped_onlyrhs_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get());
+
+            if (has_pad_y)
+            {
+                ARM_COMPUTE_ERROR_ON(has_pad_y);
+            }
+            else
+            {
+                CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_onlyrhs_pack, true);
+            }
+            break;
+        }
+        case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL:
+        {
+            if (!_reshape_b_only_on_first_run)
+            {
+                // Run transpose kernel
+                ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}};
+                CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false);
+            }
+            // In case of RESHAPED_ONLY_RHS, we need to check the padding requirement
+            // Check if the lhs or dst tensors have padding
+            const unsigned int cross_plane_pad_lhs = lhs->info()->padding().top + lhs->info()->padding().bottom;
+            const unsigned int cross_plane_pad_dst = dst->info()->padding().top + dst->info()->padding().bottom;
+            bool               has_pad_y           = (cross_plane_pad_lhs != 0) || (cross_plane_pad_dst != 0);
+
+            // Copy original tensor pack and overwrite rhs with reshaped counterpart
+            ITensorPack gemm_reshaped_onlyrhs_pack(tensors);
+            gemm_reshaped_onlyrhs_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get());
+
+            if (has_pad_y)
+            {
+                ARM_COMPUTE_ERROR_ON(has_pad_y);
+            }
+            else
+            {
+                CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_mmul_kernel, gemm_reshaped_onlyrhs_pack, true);
+            }
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("GEMMType not supported");
+        }
+    }
+}
+
+void ClGemm::prepare(ITensorPack &constants)
+{
+    if (!_is_prepared)
+    {
+        const ITensor *src1 = constants.get_const_tensor(ACL_SRC_1);
+        ICLTensor     *rhs_aux =
+            utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(RhsReshape)));
+
+        // If memory for RHS is persistent and src1 is provided re-transform else assume that RHS is transformed
+        if ((_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent) &&
+            (src1 != nullptr && rhs_aux != nullptr) && rhs_aux)
+        {
+            ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Transforming RHS Matrix!");
+
+            CLAuxTensorHandler rhs_reshaped(_tmp_b, *rhs_aux);
+            ARM_COMPUTE_ERROR_ON(rhs_reshaped.get()->cl_buffer().get() == nullptr);
+
+            ITensorPack reshape_rhs_pack{{ACL_SRC, src1}, {ACL_DST, rhs_reshaped.get()}};
+            CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, true);
+        }
+        _is_prepared = true;
+    }
+}
+
+experimental::MemoryRequirements ClGemm::workspace() const
+{
+    return _aux_mem;
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClGemm.h b/src/gpu/cl/operators/ClGemm.h
new file mode 100644
index 0000000000..85dc1d6c8f
--- /dev/null
+++ b/src/gpu/cl/operators/ClGemm.h
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_H
+#define ARM_COMPUTE_CL_GEMM_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/function_info/GEMMInfo.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTypes.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/IClOperator.h"
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h"
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h"
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h"
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h"
+#include "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h"
+#include "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to execute GEMM on OpenCL. This function calls the following OpenCL kernels:
+ *
+ *  -# @ref kernels::ClGemmReshapeLhsMatrixKernel (only if the RESHAPED is selected by the heuristic model)
+ *  -# @ref kernels::ClGemmReshapeRhsMatrixKernel (only if either the RESHAPED or RESHAPED_ONLY_RHS is selected by the select_gemm_kernel method())
+ *  -# @ref kernels::ClGemmMatrixMultiplyNativeKernel (only if NATIVE is selected by the select_gemm_kernel method())
+ *  -# @ref kernels::ClGemmMatrixMultiplyReshapedKernel (only if RESHAPED is selected by the select_gemm_kernel method())
+ *  -# @ref kernels::ClGemmMatrixMultiplyReshapedOnlyRhsKernel (only if RESHAPED_ONLY_RHS is selected by the select_gemm_kernel method())
+ *  -# @ref kernels::ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel (only if RESHAPED_ONLY_RHS_MMUL is selected by the select_gemm_kernel method())
+ */
+class ClGemm : public IClOperator
+{
+public:
+    /** Constructor */
+    ClGemm();
+    /** Initialise the kernel's inputs and output
+     *
+     * Valid data layouts:
+     * - All
+     *
+     * Valid data type configurations:
+     * |src0         |src1        |src2      |dst            |
+     * |:------------|:-----------|:---------|:--------------|
+     * |F32          |F32         |F32       |F32            |
+     * |F16          |F16         |F16       |F16            |
+     *
+     * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
+     *
+     * @note All tensors must have the same data type.
+     *
+     * @note Whilst the first input tensor can be a vector, the second input tensor must be at least a matrix
+     *
+     * @note Batched GEMM only allows RHS tensor's rank to be <= 3
+     * @note Batched GEMM only supports broadcasting cases where RHS rank < LHS rank but not the other way around
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  a               First input tensor  (Matrix or Vector A). Data types supported: F16/F32
+     * @param[in]  b               Second input tensor (Matrix B). Data type supported: same as @p a.
+     * @param[in]  c               Third input tensor  (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a.
+     * @param[out] output          Output tensor. Data type supported: same as @p a
+     * @param[in]  alpha           Weight of the matrix product
+     * @param[in]  beta            Weight of matrix C
+     * @param[in]  gemm_info       (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
+     *                             if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping
+     *                             in case matrix A and matrix B have been already transformed.
+     */
+    void configure(const CLCompileContext &compile_context,
+                   ITensorInfo            *a,
+                   ITensorInfo            *b,
+                   ITensorInfo            *c,
+                   ITensorInfo            *output,
+                   float                   alpha,
+                   float                   beta,
+                   const GEMMInfo         &gemm_info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClGemm::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *a,
+                           const ITensorInfo *b,
+                           const ITensorInfo *c,
+                           const ITensorInfo *output,
+                           float              alpha,
+                           float              beta,
+                           const GEMMInfo    &gemm_info);
+
+    // Inherited methods overridden:
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &constants) override;
+    experimental::MemoryRequirements workspace() const override;
+
+private:
+    void configure_native(const CLCompileContext &compile_context,
+                          ITensorInfo            *a,
+                          ITensorInfo            *b,
+                          ITensorInfo            *c,
+                          ITensorInfo            *output,
+                          float                   alpha,
+                          float                   beta,
+                          const GEMMInfo         &gemm_info);
+    void configure_reshaped(const CLCompileContext &compile_context,
+                            ITensorInfo            *a,
+                            ITensorInfo            *b,
+                            ITensorInfo            *c,
+                            ITensorInfo            *output,
+                            float                   alpha,
+                            float                   beta,
+                            const GEMMInfo         &gemm_info);
+    void configure_reshaped_only_rhs(const CLCompileContext &compile_context,
+                                     ITensorInfo            *a,
+                                     ITensorInfo            *b,
+                                     ITensorInfo            *c,
+                                     ITensorInfo            *output,
+                                     float                   alpha,
+                                     float                   beta,
+                                     const GEMMInfo         &gemm_info);
+    void configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context,
+                                          ITensorInfo            *a,
+                                          ITensorInfo            *b,
+                                          ITensorInfo            *c,
+                                          ITensorInfo            *output,
+                                          float                   alpha,
+                                          float                   beta,
+                                          const GEMMInfo         &gemm_info);
+
+    static Status validate_native(const ITensorInfo *a,
+                                  const ITensorInfo *b,
+                                  const ITensorInfo *c,
+                                  const ITensorInfo *output,
+                                  float              alpha,
+                                  float              beta,
+                                  const GEMMInfo    &gemm_info);
+    static Status validate_reshaped(const ITensorInfo *a,
+                                    const ITensorInfo *b,
+                                    const ITensorInfo *c,
+                                    const ITensorInfo *output,
+                                    float              alpha,
+                                    float              beta,
+                                    const GEMMInfo    &gemm_info);
+    static Status validate_reshaped_only_rhs(const ITensorInfo *a,
+                                             const ITensorInfo *b,
+                                             const ITensorInfo *c,
+                                             const ITensorInfo *output,
+                                             float              alpha,
+                                             float              beta,
+                                             const GEMMInfo    &gemm_info);
+    static Status validate_reshaped_only_rhs_mmul(const ITensorInfo *a,
+                                                  const ITensorInfo *b,
+                                                  const ITensorInfo *c,
+                                                  const ITensorInfo *output,
+                                                  float              alpha,
+                                                  float              beta,
+                                                  const GEMMInfo    &gemm_info);
+
+private:
+    enum AuxTensorIdx
+    {
+        LhsReshape = 0,
+        RhsReshape,
+        Count
+    };
+
+private:
+    std::unique_ptr<kernels::ClGemmReshapeLhsMatrixKernel>                  _reshape_lhs_kernel;
+    std::unique_ptr<kernels::ClGemmReshapeRhsMatrixKernel>                  _reshape_rhs_kernel;
+    std::unique_ptr<kernels::ClGemmMatrixMultiplyNativeKernel>              _mm_native_kernel;
+    std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedKernel>            _mm_reshaped_kernel;
+    std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedOnlyRhsKernel>     _mm_reshaped_only_rhs_kernel;
+    std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel> _mm_reshaped_only_rhs_mmul_kernel;
+    TensorInfo                                                              _tmp_a;
+    TensorInfo                                                              _tmp_b;
+    bool                                                                    _reshape_b_only_on_first_run;
+    CLGEMMKernelType                                                        _gemm_kernel_type;
+    bool                                                                    _is_prepared;
+    experimental::MemoryRequirements                                        _aux_mem{};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLGEMM_H */
diff --git a/src/gpu/cl/operators/ClGemmConv2d.cpp b/src/gpu/cl/operators/ClGemmConv2d.cpp
new file mode 100644
index 0000000000..55d815a1ef
--- /dev/null
+++ b/src/gpu/cl/operators/ClGemmConv2d.cpp
@@ -0,0 +1,668 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClGemmConv2d.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/gpu/cl/kernels/ClActivationKernel.h"
+#include "src/gpu/cl/kernels/ClCol2ImKernel.h"
+#include "src/gpu/cl/kernels/ClIm2ColKernel.h"
+#include "src/gpu/cl/kernels/ClWeightsReshapeKernel.h"
+#include "src/gpu/cl/operators/ClGemm.h"
+#include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"
+#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
+#include "support/Cast.h"
+
+namespace arm_compute
+{
+using namespace experimental;
+using namespace misc::shape_calculator;
+using namespace utils::cast;
+namespace opencl
+{
+ClGemmConv2d::ClGemmConv2d()
+    : _weights_reshape_kernel(nullptr),
+      _im2col_kernel(nullptr),
+      _mm_gemm(nullptr),
+      _mm_gemmlowp(nullptr),
+      _col2im_kernel(nullptr),
+      _activation_kernel(nullptr),
+      _im2col_output(),
+      _weights_reshaped(),
+      _gemm_output(),
+      _skip_im2col(false),
+      _skip_col2im(false),
+      _is_quantized(false),
+      _fuse_activation(true),
+      _append_bias(false),
+      _is_prepared(false),
+      _aux_mem(AuxTensorIdx::Count)
+{
+}
+ClGemmConv2d::~ClGemmConv2d() = default;
+
+void ClGemmConv2d::configure_mm(const ClCompileContext        &compile_context,
+                                const ITensorInfo             *src,
+                                ITensorInfo                   *weights,
+                                ITensorInfo                   *biases,
+                                ITensorInfo                   *dst,
+                                const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
+                                int                            gemm_3d_depth,
+                                const ActivationLayerInfo     &act_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights);
+    ARM_COMPUTE_ERROR_THROW_ON(
+        validate_mm(src, weights, biases, dst, gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info));
+
+    const GEMMInfo &gemm_info = GEMMInfo(false,                 // is_a_reshaped
+                                         false,                 // is_b_reshaped
+                                         true,                  // reshape_b_only_on_first_run
+                                         gemm_3d_depth,         // depth_output_gemm3d
+                                         _skip_im2col,          // reinterpret_input_as_3d
+                                         false,                 // retain_internal_weights
+                                         gemmlowp_output_stage, // gemmlowp_output_stage
+                                         false,                 // fast_math
+                                         false,                 // fp_mixed_precision
+                                         true,                  // broadcast_bias
+                                         act_info               // activation_info
+    );
+
+    TensorInfo tmp_src{*src};
+    if (_is_quantized)
+    {
+        // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+        // Extract and negate input and weights offset
+        const QuantizationInfo input_quantization_info   = src->quantization_info();
+        const QuantizationInfo weights_quantization_info = weights->quantization_info();
+
+        tmp_src.set_quantization_info(
+            QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
+        weights->set_quantization_info(
+            QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
+
+        _mm_gemmlowp = std::make_unique<ClGemmLowpMatrixMultiplyCore>();
+        _mm_gemmlowp->configure(compile_context, &tmp_src, weights, biases, dst, gemm_info);
+
+        // Revert back QuantizatioInfo as weights could be used in other convolution layers
+        weights->set_quantization_info(weights_quantization_info);
+
+        auto mm_mem_req = _mm_gemmlowp->workspace();
+        for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
+        {
+            _aux_mem[cont] = mm_mem_req[cont];
+        }
+    }
+    else
+    {
+        // Configure matrix multiply function
+        _mm_gemm = std::make_unique<ClGemm>();
+        _mm_gemm->configure(compile_context, &tmp_src, weights, biases, dst, 1.0f, 1.0f, gemm_info);
+        auto mm_mem_req = _mm_gemm->workspace();
+        for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
+        {
+            _aux_mem[cont] = mm_mem_req[cont];
+        }
+    }
+}
+
+Status ClGemmConv2d::validate_mm(const ITensorInfo             *src,
+                                 const ITensorInfo             *weights,
+                                 const ITensorInfo             *biases,
+                                 const ITensorInfo             *dst,
+                                 const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
+                                 int                            gemm_3d_depth,
+                                 bool                           skip_im2col,
+                                 const ActivationLayerInfo     &act_info)
+{
+    const bool is_quantized = is_data_type_quantized_asymmetric(src->data_type());
+
+    const GEMMInfo &gemm_info = GEMMInfo(false,                 // is_a_reshaped
+                                         false,                 // is_b_reshaped
+                                         true,                  // reshape_b_only_on_first_run
+                                         gemm_3d_depth,         // depth_output_gemm3d
+                                         skip_im2col,           // reinterpret_input_as_3d
+                                         false,                 // retain_internal_weights
+                                         gemmlowp_output_stage, // gemmlowp_output_stage
+                                         false,                 // fast_math
+                                         false,                 // fp_mixed_precision
+                                         true,                  // broadcast_bias
+                                         act_info               // activation_info
+    );
+
+    if (is_quantized)
+    {
+        // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+        // Extract and negate input and weights offset
+        const QuantizationInfo input_quantization_info   = src->quantization_info();
+        const QuantizationInfo weights_quantization_info = weights->quantization_info();
+
+        std::unique_ptr<ITensorInfo> src_qa     = src->clone();
+        std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
+        src_qa->set_quantization_info(
+            QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
+        weights_qa->set_quantization_info(
+            QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
+
+        // Perform validation step on GEMMLowp
+        return ClGemmLowpMatrixMultiplyCore::validate(src_qa.get(), weights_qa.get(), biases, dst, gemm_info);
+    }
+    else
+    {
+        // Perform validation step on Matrix multiply function
+        return ClGemm::validate(src, weights, biases, dst, 1.0f, 1.0f, gemm_info);
+    }
+}
+
+void ClGemmConv2d::configure(const CLCompileContext &compile_context,
+                             ITensorInfo            *src,
+                             ITensorInfo            *weights,
+                             ITensorInfo            *biases,
+                             ITensorInfo            *dst,
+                             const Conv2dInfo       &conv2d_info,
+                             const WeightsInfo      &weights_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+
+    ARM_COMPUTE_ERROR_THROW_ON(ClGemmConv2d::validate(src, weights, biases, dst, conv2d_info, weights_info));
+    ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv2d_info, weights_info);
+
+    const DataType   data_type   = src->data_type();
+    const DataLayout data_layout = src->data_layout();
+    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int        idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+    const unsigned int kernel_width  = weights->dimension(idx_width);
+    const unsigned int kernel_height = weights->dimension(idx_height);
+    const unsigned int num_kernels   = weights->dimension(idx_kernels);
+
+    const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
+    const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
+
+    _is_prepared  = weights_info.retain_internal_weights();
+    _is_quantized = is_data_type_quantized_asymmetric(src->data_type());
+    _skip_im2col  = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 &&
+                    conv2d_info.conv_info.stride().first == 1 && conv2d_info.conv_info.stride().second == 1);
+    _skip_col2im  = data_layout == DataLayout::NHWC;
+
+    // Only for quantize there are few cases where we cannot fuse the activation function in GEMM
+    _fuse_activation = true;
+
+    const ITensorInfo *gemm_input_to_use  = src;
+    ITensorInfo       *gemm_output_to_use = dst;
+
+    // Get parameters from conv_info
+    unsigned int stride_x        = 0;
+    unsigned int stride_y        = 0;
+    std::tie(stride_x, stride_y) = conv2d_info.conv_info.stride();
+
+    // Get convolved dimensions
+    unsigned int conv_w      = 0;
+    unsigned int conv_h      = 0;
+    std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+                                                 kernel_height, conv2d_info.conv_info, conv2d_info.dilation);
+
+    unsigned int mat_weights_cols = num_kernels / conv2d_info.num_groups;
+
+    ITensorInfo *biases_to_use = biases;
+    _append_bias               = false;
+
+    _weights_reshape_kernel = std::make_unique<kernels::ClWeightsReshapeKernel>();
+    if (conv2d_info.num_groups != 1 && biases != nullptr)
+    {
+        // num_groups != 1 can only be for NCHW
+        // Since it is missing an utility function to reshape the biases, we append the biases into the weights tensor
+        biases_to_use = nullptr;
+        _append_bias  = true;
+        _weights_reshape_kernel->configure(compile_context, weights, biases, &_weights_reshaped,
+                                           conv2d_info.num_groups);
+    }
+    else
+    {
+        _weights_reshape_kernel->configure(compile_context, weights, nullptr, &_weights_reshaped,
+                                           conv2d_info.num_groups);
+    }
+
+    // Create tensor to store im2col reshaped inputs
+    if (!_skip_im2col)
+    {
+        // Configure and tune im2col. im2col output shape is auto-initialized
+        _im2col_kernel = std::make_unique<opencl::kernels::ClIm2ColKernel>();
+
+        // Set the GPU target for im2col
+        _im2col_kernel->set_target(CLScheduler::get().target());
+        _im2col_kernel->configure(compile_context, src, &_im2col_output, Size2D(kernel_width, kernel_height),
+                                  conv2d_info.conv_info, _append_bias, conv2d_info.dilation, conv2d_info.num_groups);
+
+        // Set quantization info
+        _im2col_output.set_quantization_info(src->quantization_info());
+        CLScheduler::get().tune_kernel_static(*_im2col_kernel);
+
+        // Update GEMM input
+        gemm_input_to_use = &_im2col_output;
+    }
+
+    // Create GEMM output tensor
+    if (!_skip_col2im)
+    {
+        TensorShape shape_gemm;
+
+        // If we cannot skip col2im it means we run im2col as well
+        shape_gemm = _im2col_output.tensor_shape();
+        shape_gemm.set(0, mat_weights_cols);
+        shape_gemm.set(1, conv_w * conv_h);
+
+        _gemm_output = TensorInfo(shape_gemm, 1, data_type);
+        _gemm_output.set_quantization_info(dst->quantization_info()).set_data_layout(src->data_layout());
+
+        // Update GEMM output
+        gemm_output_to_use = &_gemm_output;
+    }
+
+    GEMMLowpOutputStageInfo gemmlowp_output_stage;
+    gemmlowp_output_stage.type            = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+    gemmlowp_output_stage.gemmlowp_offset = 0;
+
+    // Configure output stage for quantized case
+    if (_is_quantized)
+    {
+        const auto         output_quant_info        = (dst->total_size() == 0) ? iq_info : oq_info;
+        const bool         is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type());
+        const unsigned int num_filters              = (is_quantized_per_channel) ? num_kernels : 1;
+
+        gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel;
+
+        gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters);
+        gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters);
+        quantization::compute_quantized_multipliers_and_shifts(src, weights, dst,
+                                                               gemmlowp_output_stage.gemmlowp_multipliers.data(),
+                                                               gemmlowp_output_stage.gemmlowp_shifts.data());
+        gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0];
+        gemmlowp_output_stage.gemmlowp_shift      = gemmlowp_output_stage.gemmlowp_shifts[0];
+
+        PixelValue min_val{};
+        PixelValue max_val{};
+        std::tie(min_val, max_val) = get_min_max(dst->data_type());
+
+        auto min_activation = min_val.get<int32_t>();
+        auto max_activation = max_val.get<int32_t>();
+
+        const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = {
+            ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+            ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
+
+        if (conv2d_info.act_info.enabled())
+        {
+            if (supported_acts.count(conv2d_info.act_info.activation()) != 0)
+            {
+                std::tie(min_activation, max_activation) =
+                    get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info);
+            }
+            else
+            {
+                _fuse_activation = false;
+            }
+        }
+
+        // Set the GEMMLowp output stage info
+        gemmlowp_output_stage.gemmlowp_offset    = output_quant_info.offset;
+        gemmlowp_output_stage.gemmlowp_min_bound = min_activation;
+        gemmlowp_output_stage.gemmlowp_max_bound = max_activation;
+    }
+
+    // Configure and tune GEMM
+    // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
+    const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
+
+    configure_mm(compile_context, gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use,
+                 gemmlowp_output_stage, gemm_3d_depth, conv2d_info.act_info);
+
+    if (!_skip_col2im)
+    {
+        // Set the GPU target for col2im
+        _col2im_kernel = std::make_unique<opencl::kernels::ClCol2ImKernel>();
+        _col2im_kernel->set_target(CLScheduler::get().target());
+        // Configure and tune Col2Im
+        _col2im_kernel->configure(compile_context, gemm_output_to_use, dst, Size2D(conv_w, conv_h),
+                                  conv2d_info.num_groups);
+        CLScheduler::get().tune_kernel_static(*_col2im_kernel.get());
+    }
+
+    ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h),
+                             "Output shape does not match the expected one");
+
+    if (!_fuse_activation)
+    {
+        _activation_kernel = std::make_unique<opencl::kernels::ClActivationKernel>();
+        _activation_kernel->configure(compile_context, dst, nullptr, conv2d_info.act_info);
+    }
+
+    _aux_mem[Im2ColOutput] =
+        MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size());
+    _aux_mem[WeightsReshaped] =
+        MemoryInfo(offset_int_vec(WeightsReshaped), MemoryLifetime::Persistent, _weights_reshaped.total_size());
+    _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size());
+}
+
+Status ClGemmConv2d::validate(const ITensorInfo *src,
+                              const ITensorInfo *weights,
+                              const ITensorInfo *biases,
+                              const ITensorInfo *dst,
+                              const Conv2dInfo  &conv2d_info,
+                              const WeightsInfo &weights_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
+    const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type());
+
+    if (!is_quantized_per_channel)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
+    }
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW),
+                                    "Grouping (num_groups != 1) with NHWC data layout is not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_type() == DataType::QASYMM8),
+                                    "Grouping (num_groups != 1) is not supported with QASYMM8");
+    ARM_COMPUTE_RETURN_ERROR_ON(((src->dimension(2) / weights->dimension(2)) != conv2d_info.num_groups) &&
+                                (src->data_layout() == DataLayout::NCHW));
+
+    const DataLayout data_layout = src->data_layout();
+    const DataType   data_type   = src->data_type();
+    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int        idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+    const int        idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+    const unsigned int kernel_width  = weights->dimension(idx_width);
+    const unsigned int kernel_height = weights->dimension(idx_height);
+    const unsigned int num_kernels   = weights->dimension(idx_kernels);
+
+    TensorInfo         im2col_reshaped_info{};
+    TensorInfo         info_gemm{};
+    TensorInfo         weights_reshaped_info{};
+    const ITensorInfo *gemm_input_to_use  = src;
+    const ITensorInfo *gemm_output_to_use = dst;
+    const ITensorInfo *weights_to_use     = weights;
+    const bool         is_quantized       = is_data_type_quantized_asymmetric(data_type);
+    const bool         skip_im2col     = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 &&
+                              conv2d_info.conv_info.stride().first == 1 && conv2d_info.conv_info.stride().second == 1);
+    const bool         skip_col2im     = data_layout == DataLayout::NHWC;
+    bool               fuse_activation = true;
+
+    ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * conv2d_info.num_groups) !=
+                                src->dimension(idx_channel));
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+
+    // Validate biases
+    if (biases != nullptr)
+    {
+        if (is_quantized)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
+        }
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels));
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+    }
+
+    if (conv2d_info.act_info.enabled())
+    {
+        ARM_COMPUTE_ERROR_ON(conv2d_info.act_info.b() > conv2d_info.act_info.a());
+    }
+
+    // Get convolved dimensions
+    unsigned int conv_w = 0;
+    unsigned int conv_h = 0;
+
+    std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+                                                 kernel_height, conv2d_info.conv_info, conv2d_info.dilation);
+
+    unsigned int mat_weights_cols = num_kernels / conv2d_info.num_groups;
+
+    const ITensorInfo *biases_to_use = biases;
+    bool               append_bias   = false;
+
+    if (conv2d_info.num_groups != 1 && biases != nullptr)
+    {
+        // num_groups != 1 can only be for NCHW
+        // Since it is missing an utility function to reshape the biases, we append the biases into the weights tensor
+        biases_to_use = nullptr;
+        append_bias   = true;
+        weights_reshaped_info =
+            TensorInfo(compute_weights_reshaped_shape(*weights, true, conv2d_info.num_groups), 1, data_type);
+    }
+    else
+    {
+        weights_reshaped_info =
+            TensorInfo(compute_weights_reshaped_shape(*weights, false, conv2d_info.num_groups), 1, data_type);
+    }
+
+    weights_to_use = &weights_reshaped_info;
+
+    if (!skip_im2col)
+    {
+        const Size2D kernel_dims(kernel_width, kernel_height);
+
+        // Output tensor auto initialization if not yet initialized
+        TensorShape expected_output_shape =
+            compute_im2col_conv_shape(src, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation,
+                                      conv2d_info.num_groups == 1, conv2d_info.num_groups);
+
+        auto_init_if_empty(im2col_reshaped_info, src->clone()->set_tensor_shape(expected_output_shape));
+
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            opencl::kernels::ClIm2ColKernel::validate(src, &im2col_reshaped_info, kernel_dims, conv2d_info.conv_info,
+                                                      append_bias, conv2d_info.dilation, conv2d_info.num_groups));
+        gemm_input_to_use = &im2col_reshaped_info;
+    }
+
+    // Create GEMM output tensor
+    if (!skip_col2im)
+    {
+        TensorShape shape_gemm;
+
+        shape_gemm = gemm_input_to_use->tensor_shape();
+        shape_gemm.set(0, mat_weights_cols);
+        shape_gemm.set(1, conv_w * conv_h);
+
+        info_gemm = TensorInfo(shape_gemm, 1, data_type);
+        info_gemm.set_quantization_info(dst->quantization_info()).set_data_layout(src->data_layout());
+        gemm_output_to_use = &info_gemm;
+    }
+
+    GEMMLowpOutputStageInfo gemmlowp_output_stage;
+    gemmlowp_output_stage.type                     = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+    gemmlowp_output_stage.gemmlowp_offset          = 0;
+    gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel;
+
+    if (is_quantized)
+    {
+        const UniformQuantizationInfo iq_info           = src->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info           = dst->quantization_info().uniform();
+        const auto                    output_quant_info = (dst->total_size() == 0) ? iq_info : oq_info;
+        const unsigned int            num_filters       = (is_quantized_per_channel) ? num_kernels : 1;
+
+        gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters);
+        gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters);
+        quantization::compute_quantized_multipliers_and_shifts(src, weights, dst,
+                                                               gemmlowp_output_stage.gemmlowp_multipliers.data(),
+                                                               gemmlowp_output_stage.gemmlowp_shifts.data());
+        gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0];
+        gemmlowp_output_stage.gemmlowp_shift      = gemmlowp_output_stage.gemmlowp_shifts[0];
+
+        int min_activation = 0;
+        int max_activation = 0;
+
+        const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = {
+            ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+            ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
+
+        if (conv2d_info.act_info.enabled())
+        {
+            if (supported_acts.count(conv2d_info.act_info.activation()) != 0)
+            {
+                std::tie(min_activation, max_activation) =
+                    get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info);
+            }
+            else
+            {
+                fuse_activation = false;
+            }
+        }
+
+        // Set the GEMMLowp output stage info
+        gemmlowp_output_stage.gemmlowp_offset    = output_quant_info.offset;
+        gemmlowp_output_stage.gemmlowp_min_bound = min_activation;
+        gemmlowp_output_stage.gemmlowp_max_bound = max_activation;
+    }
+
+    // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
+    const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
+
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use,
+                                            gemmlowp_output_stage, gemm_3d_depth, skip_im2col, conv2d_info.act_info));
+
+    // Validate Col2Im
+    if (!skip_col2im)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            kernels::ClCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups));
+    }
+
+    // Validate Activation Layer
+    if (!fuse_activation)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, nullptr, conv2d_info.act_info));
+    }
+
+    return Status{};
+}
+
+void ClGemmConv2d::run(ITensorPack &tensors)
+{
+    prepare(tensors);
+
+    auto src                = tensors.get_const_tensor(ACL_SRC_0);
+    auto biases             = tensors.get_const_tensor(ACL_SRC_2);
+    auto dst                = tensors.get_tensor(ACL_DST);
+    auto gemm_input_to_use  = src;
+    auto gemm_output_to_use = dst;
+
+    CLAuxTensorHandler im2col_output(offset_int_vec(Im2ColOutput), _im2col_output, tensors, false);
+    CLAuxTensorHandler gemm_output(offset_int_vec(GemmOutput), _gemm_output, tensors, false);
+    CLAuxTensorHandler weights_reshaped(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, false);
+
+    // Run im2col
+    if (!_skip_im2col)
+    {
+        ITensorPack pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, im2col_output.get()}};
+        CLScheduler::get().enqueue_op(*_im2col_kernel, pack, false);
+        gemm_input_to_use = im2col_output.get();
+    }
+    if (!_skip_col2im)
+    {
+        gemm_output_to_use = gemm_output.get();
+    }
+    ITensorPack pack_mm = tensors;
+    pack_mm.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use);
+    pack_mm.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get());
+    if (!_append_bias)
+    {
+        pack_mm.add_const_tensor(TensorType::ACL_SRC_2, biases);
+    }
+    pack_mm.add_tensor(TensorType::ACL_DST, gemm_output_to_use);
+    // Runs ClGemm or ClGemmLowpMatrixMultiplyCore functions
+    if (_is_quantized)
+    {
+        // Run gemmlowp
+        _mm_gemmlowp->run(pack_mm);
+    }
+    else
+    {
+        // Run gemm
+        _mm_gemm->run(pack_mm);
+    }
+
+    // Reshape output matrix
+    if (!_skip_col2im)
+    {
+        ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}};
+        CLScheduler::get().enqueue_op(*_col2im_kernel.get(), pack, false);
+    }
+
+    //Run Activation Layer if we cannot fuse in GEMM
+    if (!_fuse_activation)
+    {
+        ITensorPack pack = {{TensorType::ACL_SRC, dst}, {TensorType::ACL_DST, dst}};
+        CLScheduler::get().enqueue_op(*_activation_kernel.get(), pack, false);
+    }
+}
+
+void ClGemmConv2d::prepare(ITensorPack &tensors)
+{
+    if (!_is_prepared)
+    {
+        // Run weights reshaping and mark original weights tensor as unused
+        ICLTensor *weights_reshaped_p =
+            utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(WeightsReshaped)));
+        CLAuxTensorHandler weights_reshaped(_weights_reshaped, *weights_reshaped_p);
+        auto               weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+        ITensorPack        pack    = {{TensorType::ACL_SRC, weights}, {TensorType::ACL_DST, weights_reshaped.get()}};
+
+        if (_append_bias)
+        {
+            const auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+            pack.add_const_tensor(TensorType::ACL_BIAS, biases);
+        }
+        CLScheduler::get().enqueue_op(*_weights_reshape_kernel.get(), pack, true);
+        tensors.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get());
+
+        // Prepare GEMM
+        _is_quantized ? _mm_gemmlowp->prepare(tensors) : _mm_gemm->prepare(tensors);
+        _is_prepared = true;
+    }
+}
+experimental::MemoryRequirements ClGemmConv2d::workspace() const
+{
+    return _aux_mem;
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClGemmConv2d.h b/src/gpu/cl/operators/ClGemmConv2d.h
new file mode 100644
index 0000000000..e8f3147ac3
--- /dev/null
+++ b/src/gpu/cl/operators/ClGemmConv2d.h
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_OPERATORS_CLGEMMCONV2D_H
+#define ACL_SRC_GPU_CL_OPERATORS_CLGEMMCONV2D_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+class ClGemm;
+class ClGemmLowpMatrixMultiplyCore;
+namespace kernels
+{
+class ClIm2ColKernel;
+class ClCol2ImKernel;
+class ClWeightsReshapeKernel;
+class ClActivationKernel;
+} // namespace kernels
+
+/** Basic function to compute the convolution layer. This function calls the following OpenCL kernels/functions:
+ *
+ * -# @ref opencl::kernels::ClIm2ColKernel
+ * -# @ref ClGemm (if the data type is FP32 or FP16)
+ * -# @ref CLGEMMLowpMatrixMultiplyCore (if the data type is QASYMM8/QASYMM8_SIGNED)
+ * -# @ref ClGemmLowpOutputStage with QUANTIZE_DOWN_FIXEDPOINT type of quantization (if the data type is QASYMM8/QASYMM8_SIGNED)
+ * -# @ref opencl::kernels::ClCol2ImKernel (if NCHW data layout)
+ * -# @ref opencl::kernels::ClActivationKernel
+ */
+class ClGemmConv2d : public IClOperator
+{
+public:
+    /** Constructor */
+    ClGemmConv2d();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    ClGemmConv2d(const ClGemmConv2d &) = delete;
+    /** Default move constructor */
+    ClGemmConv2d(ClGemmConv2d &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    ClGemmConv2d &operator=(const ClGemmConv2d &) = delete;
+    /** Default move assignment operator */
+    ClGemmConv2d &operator=(ClGemmConv2d &&) = default;
+    /**Default destructor */
+    ~ClGemmConv2d();
+    /** Set the input and output tensors.
+     *
+     * Valid data layouts:
+     * - NHWC
+     * - NCHW
+     *
+     * Valid data type configurations:
+     * |src0           |src1               |src2     |dst            |
+     * |:--------------|:------------------|:--------|:--------------|
+     * |F16            |F16                |F16      |F16            |
+     * |F32            |F32                |F32      |F32            |
+     * |QASYMM8        |QASYMM8            |S32      |QASYMM8        |
+     * |QASYMM8        |QSYMM8_PER_CHANNEL |S32      |QASYMM8        |
+     * |QASYMM8_SIGNED |QASYMM8_SIGNED     |S32      |QASYMM8_SIGNED |
+     * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32      |QASYMM8_SIGNED |
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+     *                             while every optional dimension from 4 and above represent a batch of inputs.
+     *                             Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  weights         Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+     *                             Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8 or QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8_SIGNED.
+     * @param[in]  biases          Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+     *                             Data type supported: Should match @p input data type, except for input of quantized type where biases should be of S32 type.
+     * @param[out] dst             Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+     *                             Data types supported: Same as @p input.
+     * @param[in]  conv2d_info     Contains convolution 2d info described in @ref Conv2dInfo.
+     * @param[in]  weights_info    Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. If this is not part of the fully connected layer the weights
+     *                             tensor has also been transposed with CLGEMMReshapeRHSMatrixKernel. Data type supported: Same as @p input.
+     */
+    void configure(const ClCompileContext &compile_context,
+                   ITensorInfo            *src,
+                   ITensorInfo            *weights,
+                   ITensorInfo            *biases,
+                   ITensorInfo            *dst,
+                   const Conv2dInfo       &conv2d_info,
+                   const WeightsInfo      &weights_info = WeightsInfo());
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClGemmConvolution::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input,
+                           const ITensorInfo *weights,
+                           const ITensorInfo *biases,
+                           const ITensorInfo *output,
+                           const Conv2dInfo  &conv2d_info,
+                           const WeightsInfo &weights_info = WeightsInfo());
+
+    // Inherited methods overridden:
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &constants) override;
+    experimental::MemoryRequirements workspace() const override;
+
+private:
+    /** Configures the appropriate matrix multiply routine
+     *
+     * @param[in]      compile_context       The compile context to be used.
+     * @param[in]      src                   Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]      weights               Weights tensor info. Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8 or
+     *                                       QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8_SIGNED.
+     * @param[in]      biases                Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+     *                                       Data type supported: Should match @p input data type, except for input of quantized type where biases should be of S32 type.
+     * @param[in, out] dst                   Output tensor info. Data types supported: same as @p input.
+     * @param[in]      gemmlowp_output_stage GEMMLowp output stage info
+     * @param[in]      gemm_3d_depth         Depth of GEMM 3D
+     * @param[in]      act_info              Activation to apply after the matrix multiplication
+     */
+    void configure_mm(const CLCompileContext        &compile_context,
+                      const ITensorInfo             *src,
+                      ITensorInfo                   *weights,
+                      ITensorInfo                   *biases,
+                      ITensorInfo                   *dst,
+                      const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
+                      int                            gemm_3d_depth,
+                      const ActivationLayerInfo     &act_info);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMConvolutionLayer matrix multiply routines
+     *
+     * @param[in] src                   Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] weights               Weights tensor info. Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8 or
+     *                                  QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8_SIGNED.
+     * @param[in] biases                Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+     *                                  Data type supported: Should match @p input data type, except for input of quantized type where biases should be of S32 type.
+     * @param[in] dst                   Output tensor info. Data types supported: same as @p input.
+     * @param[in] gemmlowp_output_stage GEMMLowp output stage info
+     * @param[in] gemm_3d_depth         Depth of GEMM 3D
+     * @param[in] skip_im2col           Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout.
+     * @param[in] act_info              Activation to apply after the matrix multiplication
+     *
+     * @return a status
+     */
+    static Status validate_mm(const ITensorInfo             *src,
+                              const ITensorInfo             *weights,
+                              const ITensorInfo             *biases,
+                              const ITensorInfo             *dst,
+                              const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
+                              int                            gemm_3d_depth,
+                              bool                           skip_im2col,
+                              const ActivationLayerInfo     &act_info);
+
+    enum AuxTensorIdx
+    {
+        // ClGemmLowpMatrixMultiplyCore has up to 7 internal tensors
+        Im2ColOutput = 8,
+        WeightsReshaped,
+        GemmOutput,
+        Count
+    };
+
+    std::unique_ptr<kernels::ClWeightsReshapeKernel> _weights_reshape_kernel;
+    std::unique_ptr<kernels::ClIm2ColKernel>         _im2col_kernel;
+    std::unique_ptr<ClGemm>                          _mm_gemm;
+    std::unique_ptr<ClGemmLowpMatrixMultiplyCore>    _mm_gemmlowp;
+    std::unique_ptr<opencl::kernels::ClCol2ImKernel> _col2im_kernel;
+    std::unique_ptr<kernels::ClActivationKernel>     _activation_kernel;
+
+    TensorInfo _im2col_output;
+    TensorInfo _weights_reshaped;
+    TensorInfo _gemm_output;
+
+    bool _skip_im2col;
+    bool _skip_col2im;
+    bool _is_quantized;
+    bool _fuse_activation;
+    bool _append_bias;
+    bool _is_prepared;
+
+    experimental::MemoryRequirements _aux_mem;
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_OPERATORS_CLGEMMCONV2D_H
diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
new file mode 100644
index 0000000000..71c247de79
--- /dev/null
+++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
@@ -0,0 +1,950 @@
+/*
+ * Copyright (c) 2017-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"
+
+#include "arm_compute/core/Log.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/gpu/cl/kernels/ClCastKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h"
+#include "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
+#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
+#include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::cl_gemm;
+using namespace arm_compute::opencl::kernels;
+using namespace arm_compute::experimental;
+
+namespace
+{
+inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type)
+{
+    switch (kernel_type)
+    {
+        case CLGEMMKernelType::NATIVE:
+        case CLGEMMKernelType::RESHAPED_ONLY_RHS:
+        case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL:
+        {
+            return true;
+        }
+        default:
+        {
+            return false;
+        }
+    }
+}
+
+//Automatically select between mlgo (prioritized) and default heuristics for gemm kernel type
+inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run)
+{
+    auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run);
+    if (bool(gemm_kernel))
+    {
+        if (validate_gemm_kernel(gemm_kernel.gemm_type))
+        {
+            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.",
+                                                      to_string(gemm_kernel.gemm_type).c_str());
+            return gemm_kernel.gemm_type;
+        }
+    }
+    gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run);
+    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.",
+                                              to_string(gemm_kernel.gemm_type).c_str());
+    return gemm_kernel.gemm_type;
+}
+
+// Validate lhs_info and rhs_info for native kernel
+inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info,
+                                         const GEMMRHSMatrixInfo &rhs_info,
+                                         const ITensorInfo       *a,
+                                         const ITensorInfo       *b,
+                                         const GEMMReshapeInfo   &reshape_info)
+{
+    // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel
+    TensorInfo mm_result_s32_info{};
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(
+        mm_result_s32_info,
+        a->clone()->set_tensor_shape(compute_mm_shape(*a, *b, false, reshape_info)).set_data_type(DataType::S32));
+    // Validate mm kernel
+    // NOTE: Ignore all other parameters (eg. output stage etc.) and only validate lhs and rhs info
+    // NOTE: This assumes:
+    //  1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_arguments).
+    //  2. lhs and rhs info does not cause window and padding issues through side effects (in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_and_configure_window).
+    if (!bool(ClGemmLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info,
+                                                             reshape_info)))
+    {
+        return false;
+    }
+    return true;
+}
+
+// Automatically select between mlgo (prioritized) and default heuristics for native kernel configs
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_native(auto_heuristics::CommonQuery query,
+                                                                               const ITensorInfo           *a,
+                                                                               const ITensorInfo           *b,
+                                                                               const GEMMReshapeInfo &reshape_info)
+{
+    auto config = auto_heuristics::select_mlgo_gemm_config_native(query);
+    if (config)
+    {
+        if (validate_lhs_rhs_info_native(config.lhs_info, config.rhs_info, a, b, reshape_info))
+        {
+            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+                "Use native config from mlgo heuristics: LHS info: %s ; RHS info: %s ",
+                to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+            return {config.lhs_info, config.rhs_info};
+        }
+    }
+    config = auto_heuristics::select_default_gemm_config_native(query);
+    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from default heuristics: LHS info: %s ; RHS info: %s ",
+                                              to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+    return {config.lhs_info, config.rhs_info};
+}
+
+// Validate lhs_info and rhs_info for reshaped only rhs kernel
+inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info,
+                                                    const GEMMRHSMatrixInfo &rhs_info,
+                                                    const ITensorInfo       *a,
+                                                    const ITensorInfo       *b,
+                                                    const ITensorInfo       *output,
+                                                    unsigned int             m,
+                                                    unsigned int             n,
+                                                    unsigned int             k,
+                                                    bool                     reinterpret_input_as_3d,
+                                                    int                      depth_output_gemm3d)
+{
+    // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel
+    TensorInfo tmp_b_info{};
+    // Validate reshape RHS kernel
+    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+    if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
+    {
+        return false;
+    }
+    // Validate mm kernel
+    // NOTE: Ignore all other parameters (eg. depth_output_gemm3d, output stage etc.) and only validate lhs and rhs info
+    // NOTE: This assumes:
+    //  1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_arguments).
+    //  2. lhs and rhs info does not cause window and padding issues through side effects (in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_and_configure_window).
+    GEMMKernelInfo gemm_kernel_info;
+    gemm_kernel_info.m                       = m;
+    gemm_kernel_info.n                       = n;
+    gemm_kernel_info.k                       = k;
+    gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+    gemm_kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
+    gemm_kernel_info.lhs_info                = lhs_info;
+    gemm_kernel_info.rhs_info                = rhs_info;
+    // Since we ignore the output stage, output data type has to be S32 to pass the validation
+    TensorInfo output_info_copy(*output);
+    output_info_copy.set_data_type(DataType::S32);
+    if (!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, &output_info_copy,
+                                                                      gemm_kernel_info)))
+    {
+        return false;
+    }
+    return true;
+}
+
+// Validate lhs_info and rhs_info for reshaped only rhs kernel
+inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo &lhs_info,
+                                                         const GEMMRHSMatrixInfo &rhs_info,
+                                                         const ITensorInfo       *a,
+                                                         const ITensorInfo       *b,
+                                                         const ITensorInfo       *output,
+                                                         unsigned int             m,
+                                                         unsigned int             n,
+                                                         unsigned int             k,
+                                                         bool                     reinterpret_input_as_3d,
+                                                         int                      depth_output_gemm3d)
+{
+    // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel
+    TensorInfo tmp_b_info{};
+    // Validate reshape RHS kernel
+    auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+    if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
+    {
+        return false;
+    }
+    // Validate mm kernel
+    // NOTE: Ignore all other parameters (eg. depth_output_gemm3d, output stage etc.) and only validate lhs and rhs info
+    // NOTE: This assumes:
+    //  1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_arguments).
+    //  2. lhs and rhs info does not cause window and padding issues through side effects (in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_and_configure_window).
+    GEMMKernelInfo gemm_kernel_info;
+    gemm_kernel_info.m                       = m;
+    gemm_kernel_info.n                       = n;
+    gemm_kernel_info.k                       = k;
+    gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+    gemm_kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
+    gemm_kernel_info.lhs_info                = lhs_info;
+    gemm_kernel_info.rhs_info                = rhs_info;
+    // Since we ignore the output stage, output data type has to be S32 to pass the validation
+    TensorInfo output_info_copy(*output);
+    output_info_copy.set_data_type(DataType::S32);
+    if (!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, &output_info_copy,
+                                                                          gemm_kernel_info)))
+    {
+        return false;
+    }
+    return true;
+}
+
+// Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query,
+                                          bool                         reinterpret_input_as_3d,
+                                          int                          depth_output_gemm3d,
+                                          const ITensorInfo           *a,
+                                          const ITensorInfo           *b,
+                                          const ITensorInfo           *output)
+{
+    auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query);
+    if (config)
+    {
+        if (validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n,
+                                                    query.k, reinterpret_input_as_3d, depth_output_gemm3d))
+        {
+            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+                "Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ",
+                to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+            return {config.lhs_info, config.rhs_info};
+        }
+    }
+    config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);
+    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+        "Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ",
+        to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+    return {config.lhs_info, config.rhs_info};
+}
+
+// Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery query,
+                                               bool                         reinterpret_input_as_3d,
+                                               int                          depth_output_gemm3d,
+                                               const ITensorInfo           *a,
+                                               const ITensorInfo           *b,
+                                               const ITensorInfo           *output)
+{
+    ARM_COMPUTE_UNUSED(a, b, output, reinterpret_input_as_3d, depth_output_gemm3d);
+    auto config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);
+    validate_lhs_rhs_info_reshaped_only_rhs_mmul(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n,
+                                                 query.k, reinterpret_input_as_3d, depth_output_gemm3d);
+    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+        "Use reshaped_only_rhs_mmul config from default heuristics: LHS info: %s ; RHS info: %s ",
+        to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+    return {config.lhs_info, config.rhs_info};
+}
+
+inline bool is_gemm_reshaped(CLGEMMKernelType kernel_type)
+{
+    switch (kernel_type)
+    {
+        case CLGEMMKernelType::NATIVE:
+            return false;
+        case CLGEMMKernelType::RESHAPED_ONLY_RHS:
+        case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL:
+            return true;
+        default:
+            ARM_COMPUTE_ERROR("Not supported gemmlowp kernel!");
+    }
+}
+} // namespace
+
+ClGemmLowpMatrixMultiplyCore::ClGemmLowpMatrixMultiplyCore()
+    : _weights_to_qasymm8(std::make_unique<ClCastKernel>()),
+      _mm_native_kernel(std::make_unique<ClGemmLowpMatrixMultiplyNativeKernel>()),
+      _mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel>()),
+      _mm_reshaped_only_rhs_mmul_kernel(std::make_unique<ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel>()),
+      _mtx_b_reshape_kernel(std::make_unique<ClGemmReshapeRhsMatrixKernel>()),
+      _mtx_a_reduction_kernel(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
+      _mtx_b_reduction_kernel(std::make_unique<ClGemmLowpMatrixBReductionKernel>()),
+      _offset_contribution_kernel(std::make_unique<ClGemmLowpOffsetContributionKernel>()),
+      _offset_contribution_output_stage_kernel(std::make_unique<ClGemmLowpOffsetContributionOutputStageKernel>()),
+      _aux_mem(AuxTensorIdx::Count)
+{
+}
+
+ClGemmLowpMatrixMultiplyCore::~ClGemmLowpMatrixMultiplyCore() = default;
+
+void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context,
+                                             ITensorInfo            *a,
+                                             ITensorInfo            *b,
+                                             ITensorInfo            *c,
+                                             ITensorInfo            *output,
+                                             const GEMMInfo         &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+    ARM_COMPUTE_ERROR_THROW_ON(ClGemmLowpMatrixMultiplyCore::validate(a, b, c, output, gemm_info));
+    ARM_COMPUTE_LOG_PARAMS(a, b, c, output, gemm_info);
+
+    _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+    _a_offset                    = a->quantization_info().uniform().offset;
+    _convert_to_qasymm8          = is_data_type_quantized_per_channel(b->data_type()) &&
+                          is_data_type_quantized_symmetric(b->data_type()) && a->data_type() == DataType::QASYMM8;
+    _b_offset  = _convert_to_qasymm8 ? -128 : b->quantization_info().uniform().offset;
+    _gemm_info = gemm_info;
+
+    // Get the GPU target
+    const GPUTarget gpu_target = CLScheduler::get().target();
+
+    // Set the target for the kernels
+    _mm_native_kernel->set_target(gpu_target);
+    _mm_reshaped_only_rhs_kernel->set_target(gpu_target);
+    _mm_reshaped_only_rhs_mmul_kernel->set_target(gpu_target);
+
+    GEMMRHSMatrixInfo rhs_info;
+    GEMMLHSMatrixInfo lhs_info;
+
+    // Arguments used by GEMMReshapeInfo
+    // in order to know how the matrices have been reshaped
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m          = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n          = b->dimension(0);
+    const unsigned int k          = a->dimension(0);
+    const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+
+    const auto reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
+
+    _gemm_kernel_type = auto_select_gemm_kernel(
+        auto_heuristics::CommonQuery{gpu_target, a->data_type(), m, n, k, batch_size}, _reshape_b_only_on_first_run);
+
+    if (_convert_to_qasymm8)
+    {
+        // Set data type for converted weights
+        _qasymm8_weights = *b;
+        _qasymm8_weights.set_data_type(DataType::QASYMM8);
+        _weights_to_qasymm8->configure(compile_context, b, &_qasymm8_weights, ConvertPolicy::WRAP);
+    }
+
+    ITensorInfo *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b;
+    if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
+    {
+        matrix_b = &_tmp_b;
+
+        // Pick up the GEMM configuration
+        // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration
+        std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(
+            auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, reinterpret_input_as_3d,
+            depth_output_gemm3d, a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output);
+
+        // Configure reshape RHS kernel
+        _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b,
+                                         rhs_info);
+    }
+    if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+    {
+        matrix_b = &_tmp_b;
+
+        // Pick up the GEMM configuration
+        // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration
+        std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs_mmul(
+            auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, reinterpret_input_as_3d,
+            depth_output_gemm3d, a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output);
+
+        // Configure reshape RHS kernel
+        _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b,
+                                         rhs_info);
+    }
+
+    // Using default reduction info
+    const GEMMLowpReductionKernelInfo reduction_info{};
+
+    // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
+    if (_a_offset != 0)
+    {
+        _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
+
+        // Configure Matrix B reduction kernel
+        _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b,
+                                           &_vector_sum_col, reduction_info);
+    }
+
+    // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
+    if (_b_offset != 0)
+    {
+        _vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
+
+        // Configure matrix A reduction kernel
+        _mtx_a_reduction_kernel->configure(compile_context, a, &_vector_sum_row, reduction_info);
+    }
+
+    GEMMKernelInfo gemm_kernel_info;
+    gemm_kernel_info.m                       = m;
+    gemm_kernel_info.n                       = n;
+    gemm_kernel_info.k                       = k;
+    gemm_kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
+    gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+    gemm_kernel_info.lhs_info                = lhs_info;
+    gemm_kernel_info.rhs_info                = rhs_info;
+    gemm_kernel_info.a_offset                = _a_offset;
+    gemm_kernel_info.b_offset                = _b_offset;
+    // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
+    if (gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+    {
+        // Configure offset contribution kernel
+        const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel)
+                                       ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size()
+                                       : 1;
+
+        _gemm_output_stage_multipliers = TensorInfo(TensorShape(num_filters), 1, DataType::S32);
+        _gemm_output_stage_shifts      = TensorInfo(TensorShape(num_filters), 1, DataType::S32);
+
+        GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
+        gemmlowp_output_stage.output_data_type        = a->data_type();
+        if (num_filters == 1)
+        {
+            // Per-channel quantization with OFM == 1 is equivalent to uniform quantization.
+            // Setting this flag to false prevents the kernel from adding useless padding to the output multipliers and shifts
+            gemmlowp_output_stage.is_quantized_per_channel = false;
+        }
+
+        gemm_kernel_info.output_stage = gemmlowp_output_stage;
+
+        if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS &&
+            gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+        {
+            // Configure and tune matrix multiply kernel with fused output stage
+            _mm_reshaped_only_rhs_kernel->configure(
+                compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
+                _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr,
+                &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
+        }
+        else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL &&
+                 gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+        {
+            // Configure and tune matrix multiply kernel with fused output stage
+            _mm_reshaped_only_rhs_mmul_kernel->configure(
+                compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
+                _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr,
+                &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
+        }
+        else
+        {
+            _run_output_stage = true;
+
+            if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
+            {
+                _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32,
+                                                        gemm_kernel_info);
+            }
+            if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+            {
+                _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32,
+                                                             gemm_kernel_info);
+            }
+            else
+            {
+                // Pick up the GEMM configuration
+                // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration
+                std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(
+                    auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, a,
+                    _convert_to_qasymm8 ? &_qasymm8_weights : matrix_b, reshape_info);
+
+                // Configure matrix multiply kernel
+                _mm_native_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, lhs_info, rhs_info,
+                                             reshape_info);
+
+                _offset_contribution_output_stage_kernel->configure(
+                    compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col,
+                    _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, output, a->dimension(0),
+                    _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers,
+                    &_gemm_output_stage_shifts);
+            }
+        }
+    }
+    else
+    {
+        _run_offset_contribution = true;
+        if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
+        {
+            // Configure and tune matrix multiply kernel
+            _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info);
+        }
+        else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+        {
+            // Configure and tune matrix multiply kernel
+            _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info);
+        }
+        else
+        {
+            // Pick up the GEMM configuration
+            // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration
+            std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(
+                auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, a,
+                _convert_to_qasymm8 ? &_qasymm8_weights : b, reshape_info);
+
+            // Configure matrix multiply kernel
+            _mm_native_kernel->configure(compile_context, a, matrix_b, output, lhs_info, rhs_info, reshape_info);
+        }
+
+        // Configure offset contribution kernel
+        _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col,
+                                               _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr,
+                                               a->dimension(0), _a_offset, _b_offset);
+    }
+
+    // Request memory
+    _aux_mem[RhsQAsymm8] =
+        MemoryInfo(offset_int_vec(RhsQAsymm8),
+                   _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary,
+                   _qasymm8_weights.total_size());
+    if (is_gemm_reshaped(_gemm_kernel_type))
+    {
+        // Overwrite Rhs as prepare if gemm is reshaped as there will be a two-step transformation
+        _aux_mem[RhsQAsymm8] =
+            MemoryInfo(offset_int_vec(RhsQAsymm8),
+                       _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary,
+                       _qasymm8_weights.total_size());
+        _aux_mem[RhsReshape] = MemoryInfo(
+            offset_int_vec(RhsReshape),
+            _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
+    }
+    if (_a_offset != 0)
+    {
+        _aux_mem[VecSumCol] =
+            MemoryInfo(offset_int_vec(VecSumCol),
+                       _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary,
+                       _vector_sum_col.total_size());
+    }
+    if (_b_offset != 0)
+    {
+        _aux_mem[VecSumRow] =
+            MemoryInfo(offset_int_vec(VecSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
+    }
+    _aux_mem[ResultS32] = MemoryInfo(offset_int_vec(ResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
+    _aux_mem[Multipliers] = MemoryInfo(offset_int_vec(Multipliers), MemoryLifetime::Persistent,
+                                       _gemm_output_stage_multipliers.total_size());
+    _aux_mem[Shifts] =
+        MemoryInfo(offset_int_vec(Shifts), MemoryLifetime::Persistent, _gemm_output_stage_shifts.total_size());
+}
+
+Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a,
+                                              const ITensorInfo *b,
+                                              const ITensorInfo *c,
+                                              const ITensorInfo *output,
+                                              const GEMMInfo    &gemm_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+    ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED);
+    ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8_SIGNED && b->data_type() == DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+
+    int32_t a_offset = a->quantization_info().uniform().offset;
+    int32_t b_offset = b->quantization_info().uniform().offset;
+
+    const ITensorInfo *matrix_a_info = a;
+
+    TensorInfo        tmp_b_info{};
+    GEMMRHSMatrixInfo rhs_info;
+    GEMMLHSMatrixInfo lhs_info;
+
+    // Get the GPU target
+    const GPUTarget gpu_target = CLScheduler::get().target();
+
+    bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+    const unsigned int m          = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n          = b->dimension(0);
+    const unsigned int k          = a->dimension(0);
+    const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+
+    bool reshape_matrix_b = is_gemm_reshaped(
+        auto_select_gemm_kernel(auto_heuristics::CommonQuery{gpu_target, a->data_type(), m, n, k, batch_size},
+                                gemm_info.reshape_b_only_on_first_run()));
+
+    const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
+
+    bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) &&
+                              is_data_type_quantized_symmetric(b->data_type()) &&
+                              is_data_type_quantized_asymmetric(a->data_type());
+    TensorInfo weights_info(*b);
+    if (convert_to_qasymm8)
+    {
+        b_offset = -128;
+        weights_info.set_data_type(DataType::QASYMM8);
+        ARM_COMPUTE_RETURN_ON_ERROR(ClCastKernel::validate(b, &weights_info, ConvertPolicy::WRAP));
+    }
+    const ITensorInfo *matrix_b_info = &weights_info;
+    if (reshape_matrix_b)
+    {
+        matrix_b_info = &tmp_b_info;
+
+        // Pick up the GEMM configuration
+        // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
+        // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration
+        const auto res = select_default_gemm_config_reshaped_only_rhs(
+            auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size});
+        lhs_info = res.lhs_info;
+        rhs_info = res.rhs_info;
+
+        // Validate reshape RHS kernel
+        auto_init_if_empty(tmp_b_info,
+                           weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info)));
+        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info));
+    }
+
+    TensorInfo info_vector_sum_col{};
+    TensorInfo info_vector_sum_row{};
+
+    const GEMMLowpReductionKernelInfo reduction_info;
+    // Validate matrix B reduction kernel only if _a_offset is not equal to 0
+    if (a_offset != 0)
+    {
+        info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32);
+
+        // Configure Matrix B reduction kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            ClGemmLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info));
+    }
+
+    // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
+    if (b_offset != 0)
+    {
+        info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
+
+        // Configure matrix A reduction kernel
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            ClGemmLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info));
+    }
+
+    GEMMKernelInfo gemm_kernel_info;
+    gemm_kernel_info.m                       = m;
+    gemm_kernel_info.n                       = n;
+    gemm_kernel_info.k                       = k;
+    gemm_kernel_info.depth_output_gemm3d     = depth_output_gemm3d;
+    gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+    gemm_kernel_info.lhs_info                = lhs_info;
+    gemm_kernel_info.rhs_info                = rhs_info;
+    gemm_kernel_info.a_offset                = a_offset;
+    gemm_kernel_info.b_offset                = b_offset;
+    if (gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+    {
+        const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel)
+                                       ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size()
+                                       : 1;
+
+        const TensorInfo gemm_output_stage_multipliers_shifts_info(
+            TensorInfo(TensorShape(num_filters), 1, DataType::S32));
+
+        GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
+        gemmlowp_output_stage.output_data_type        = a->data_type();
+
+        gemm_kernel_info.output_stage = gemmlowp_output_stage;
+        if (reshape_matrix_b &&
+            gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+        {
+            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(
+                matrix_a_info, matrix_b_info, output, gemm_kernel_info, a_offset == 0 ? nullptr : &info_vector_sum_col,
+                b_offset == 0 ? nullptr : &info_vector_sum_row, c, &gemm_output_stage_multipliers_shifts_info,
+                &gemm_output_stage_multipliers_shifts_info));
+        }
+        else
+        {
+            TensorInfo mm_result_s32_info{};
+
+            if (reshape_matrix_b)
+            {
+                // Output tensor auto inizialitation if not yet initialized
+                auto_init_if_empty(mm_result_s32_info, a->clone()
+                                                           ->set_tensor_shape(compute_mm_shape(
+                                                               *matrix_a_info, *matrix_b_info, reshape_info))
+                                                           .set_data_type(DataType::S32));
+
+                // Validate matrix multiply
+                ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(
+                    matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info));
+            }
+            else
+            {
+                // Output tensor auto inizialitation if not yet initialized
+                auto_init_if_empty(mm_result_s32_info, a->clone()
+                                                           ->set_tensor_shape(compute_mm_shape(
+                                                               *matrix_a_info, *matrix_b_info, false, reshape_info))
+                                                           .set_data_type(DataType::S32));
+
+                // Pick up the GEMM configuration
+                // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
+                // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration
+                const auto res = select_default_gemm_config_native(
+                    auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size});
+                lhs_info = res.lhs_info;
+                rhs_info = res.rhs_info;
+
+                // Validate matrix multiply
+                ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(
+                    matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
+            }
+
+            // Validate offset contribution kernel
+            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionOutputStageKernel::validate(
+                &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col,
+                b_offset == 0 ? nullptr : &info_vector_sum_row, c, output, a_offset, b_offset, gemmlowp_output_stage,
+                &gemm_output_stage_multipliers_shifts_info, &gemm_output_stage_multipliers_shifts_info));
+        }
+    }
+    else
+    {
+        if (reshape_matrix_b)
+        {
+            // Validate matrix multiply
+            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(
+                matrix_a_info, matrix_b_info, output, gemm_kernel_info));
+        }
+        else
+        {
+            // Pick up the GEMM configuration
+            // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration
+            const auto res = select_default_gemm_config_native(
+                auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size});
+            lhs_info = res.lhs_info;
+            rhs_info = res.rhs_info;
+
+            // Validate matrix multiply
+            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(
+                matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
+        }
+
+        if (output->total_size() != 0)
+        {
+            // Validate offset contribution kernel
+            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionKernel::validate(
+                output, a_offset == 0 ? nullptr : &info_vector_sum_col, b_offset == 0 ? nullptr : &info_vector_sum_row,
+                c, a_offset, b_offset));
+        }
+    }
+
+    return Status{};
+}
+
+void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
+{
+    const ITensor *a   = tensors.get_const_tensor(ACL_SRC_0);
+    const ITensor *b   = tensors.get_const_tensor(ACL_SRC_1);
+    const ITensor *c   = tensors.get_const_tensor(ACL_SRC_2);
+    ITensor       *dst = tensors.get_tensor(ACL_DST);
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(a, dst);
+
+    CLAuxTensorHandler vec_sum_col(offset_int_vec(VecSumCol), _vector_sum_col, tensors, true);
+    CLAuxTensorHandler vec_sum_row(offset_int_vec(VecSumRow), _vector_sum_row, tensors, true);
+    CLAuxTensorHandler rhs_qasymm8(offset_int_vec(RhsQAsymm8), _qasymm8_weights, tensors, true);
+    CLAuxTensorHandler tmp_b(offset_int_vec(RhsReshape), _tmp_b, tensors, true);
+    CLAuxTensorHandler res32(offset_int_vec(ResultS32), _mm_result_s32, tensors, true);
+    CLAuxTensorHandler shifts(offset_int_vec(Shifts), _gemm_output_stage_shifts, tensors, true);
+    CLAuxTensorHandler multipliers(offset_int_vec(Multipliers), _gemm_output_stage_multipliers, tensors, true);
+
+    // Prepare the consts if needed
+    prepare(tensors);
+
+    const ITensor *matrix_a = a;
+    const ITensor *matrix_b = _convert_to_qasymm8 ? rhs_qasymm8.get() : b;
+
+    if (is_gemm_reshaped(_gemm_kernel_type))
+    {
+        matrix_b = tmp_b.get();
+        if (!_reshape_b_only_on_first_run)
+        {
+            // Run reshape matrix B
+            ITensorPack mtx_b_reshape_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b},
+                                              {TensorType::ACL_DST, tmp_b.get()}};
+            CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_reshape_pack, false);
+        }
+    }
+
+    // Run matrix B reduction kernel only if _a_offset is not equal to 0
+    if (_a_offset != 0 && !_reshape_b_only_on_first_run)
+    {
+        ITensorPack mtx_b_red_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b},
+                                      {TensorType::ACL_DST, vec_sum_col.get()}};
+        CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false);
+    }
+
+    // Run matrix A reduction kernel only if _b_offset is not equal to 0
+    if (_b_offset != 0)
+    {
+        ITensorPack mtx_a_red_pack = {{TensorType::ACL_SRC, matrix_a}, {TensorType::ACL_DST, vec_sum_row.get()}};
+        CLScheduler::get().enqueue_op(*_mtx_a_reduction_kernel, mtx_a_red_pack, false);
+    }
+
+    // Run matrix multiply
+    if (is_gemm_reshaped(_gemm_kernel_type))
+    {
+        ITensorPack gemm_reshaped_pack;
+        if (_run_offset_contribution)
+        {
+            gemm_reshaped_pack = ITensorPack({{TensorType::ACL_SRC_0, matrix_a},
+                                              {TensorType::ACL_SRC_1, matrix_b},
+                                              {TensorType::ACL_DST, _run_output_stage ? res32.get() : dst}});
+        }
+        else
+        {
+            gemm_reshaped_pack = ITensorPack({
+                {TensorType::ACL_SRC, matrix_a},
+                {TensorType::ACL_SRC_1, matrix_b},
+                {TensorType::ACL_BIAS, c},
+                {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()},
+                {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()},
+                {TensorType::ACL_SHIFTS, shifts.get()},
+                {TensorType::ACL_MULTIPLIERS, multipliers.get()},
+                {TensorType::ACL_DST, dst},
+            });
+        }
+        if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
+        {
+            CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_pack, false);
+        }
+        else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+        {
+            CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_mmul_kernel, gemm_reshaped_pack, false);
+        }
+        else
+        {
+            ARM_COMPUTE_ERROR("Invalid reshaped kernel");
+        }
+    }
+    else
+    {
+        ITensorPack gemm_native_pack = {{TensorType::ACL_SRC_0, matrix_a},
+                                        {TensorType::ACL_SRC_1, matrix_b},
+                                        {TensorType::ACL_DST, _run_offset_contribution ? dst : res32.get()}};
+        CLScheduler::get().enqueue_op(*_mm_native_kernel, gemm_native_pack, false);
+    }
+    if (_run_output_stage)
+    {
+        // Run offset contribution/output stage kernel
+        ITensorPack output_stage_pack = {
+            {TensorType::ACL_SRC, res32.get()},
+            {TensorType::ACL_BIAS, c},
+            {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()},
+            {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()},
+            {TensorType::ACL_SHIFTS, shifts.get()},
+            {TensorType::ACL_MULTIPLIERS, multipliers.get()},
+            {TensorType::ACL_DST, dst},
+        };
+        CLScheduler::get().enqueue_op(*_offset_contribution_output_stage_kernel, output_stage_pack, true);
+    }
+    if (_run_offset_contribution)
+    {
+        // Run offset contribution kernel
+        ITensorPack offset_contrib_pack = {{TensorType::ACL_SRC_DST, dst},
+                                           {TensorType::ACL_BIAS, c},
+                                           {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()},
+                                           {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()}};
+        CLScheduler::get().enqueue_op(*_offset_contribution_kernel, offset_contrib_pack, true);
+    }
+}
+
+void ClGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)
+{
+    if (!_is_prepared)
+    {
+        auto               b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+        CLAuxTensorHandler tmp_b(offset_int_vec(RhsReshape), _tmp_b, tensors, true);
+        CLAuxTensorHandler vec_sum_col(offset_int_vec(VecSumCol), _vector_sum_col, tensors, true);
+        CLAuxTensorHandler rhs_qasymm8(offset_int_vec(RhsQAsymm8), _qasymm8_weights, tensors, false);
+
+        ARM_COMPUTE_ERROR_ON_NULLPTR(b);
+
+        if (_convert_to_qasymm8)
+        {
+            ITensorPack convert_to_qs8_pack = {{ACL_SRC, b}, {ACL_DST, rhs_qasymm8.get()}};
+            CLScheduler::get().enqueue_op(*_weights_to_qasymm8, convert_to_qs8_pack, false);
+            b->mark_as_unused();
+        }
+
+        if (is_gemm_reshaped(_gemm_kernel_type) && _reshape_b_only_on_first_run)
+        {
+            // Run reshape kernel and mark original weights tensor as unused
+            ITensorPack mtx_b_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b},
+                                      {TensorType::ACL_DST, tmp_b.get()}};
+            CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_pack, false);
+            b->mark_as_unused();
+        }
+
+        // Run matrix B reduction kernel only if _a_offset is not equal to 0
+        if (_a_offset != 0 && _reshape_b_only_on_first_run)
+        {
+            ITensorPack mtx_b_red_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b},
+                                          {TensorType::ACL_DST, vec_sum_col.get()}};
+            CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false);
+        }
+
+        // Compute GEMM output multipliers and shifts for output stage
+        {
+            const size_t num_filters = (_gemm_info.gemmlowp_output_stage().is_quantized_per_channel)
+                                           ? _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size()
+                                           : 1;
+
+            CLAuxTensorHandler multipliers(offset_int_vec(Multipliers), _gemm_output_stage_multipliers, tensors, false);
+            CLAuxTensorHandler shifts(offset_int_vec(Shifts), _gemm_output_stage_shifts, tensors, false);
+
+            ICLTensor *multiplier_tensor = multipliers.get();
+            if (multiplier_tensor != nullptr && multiplier_tensor->info()->total_size() > 0)
+            {
+                multiplier_tensor->map(CLScheduler::get().queue(), true);
+                std::memcpy(multiplier_tensor->ptr_to_element(Coordinates(0)),
+                            _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(),
+                            num_filters * sizeof(int32_t));
+                multiplier_tensor->unmap(CLScheduler::get().queue());
+            }
+
+            ICLTensor *shifts_tensor = shifts.get();
+            if (shifts.get() != nullptr && shifts_tensor->info()->total_size() > 0)
+            {
+                shifts_tensor->map(CLScheduler::get().queue(), true);
+                std::memcpy(shifts_tensor->ptr_to_element(Coordinates(0)),
+                            _gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t));
+                shifts_tensor->unmap(CLScheduler::get().queue());
+            }
+        }
+        CLScheduler::get().queue().finish();
+        _is_prepared = true;
+    }
+}
+
+experimental::MemoryRequirements ClGemmLowpMatrixMultiplyCore::workspace() const
+{
+    return _aux_mem;
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h
new file mode 100644
index 0000000000..c80dc3a182
--- /dev/null
+++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2017-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_CORE_H
+#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_CORE_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/function_info/GEMMInfo.h"
+#include "arm_compute/runtime/CL/CLTypes.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+// Forward declarations
+class ClCastKernel;
+class ClGemmLowpMatrixMultiplyNativeKernel;
+class ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel;
+class ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel;
+class ClGemmReshapeRhsMatrixKernel;
+class ClGemmLowpMatrixAReductionKernel;
+class ClGemmLowpMatrixBReductionKernel;
+class ClGemmLowpOffsetContributionKernel;
+class ClGemmLowpOffsetContributionOutputStageKernel;
+} // namespace kernels
+
+/** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. */
+class ClGemmLowpMatrixMultiplyCore : public IClOperator
+{
+public:
+    ClGemmLowpMatrixMultiplyCore();
+    ~ClGemmLowpMatrixMultiplyCore();
+    /** Initialise the kernel's inputs, output
+     *
+     * Valid data layouts:
+     * - NHWC
+     * - NCHW
+     *
+     * Valid data type configurations:
+     * |src0           |src1               |src2     |dst            |
+     * |:--------------|:------------------|:--------|:--------------|
+     * |QASYMM8        |QASYMM8            |S32      |QASYMM8        |
+     * |QASYMM8        |QSYMM8_PER_CHANNEL |S32      |QASYMM8        |
+     * |QASYMM8        |QSYMM8             |S32      |QASYMM8        |
+     * |QASYMM8        |QASYMM8            |S32      |S32            |
+     * |QASYMM8        |QSYMM8_PER_CHANNEL |S32      |S32            |
+     * |QASYMM8        |QSYMM8             |S32      |S32            |
+     * |QASYMM8_SIGNED |QASYMM8_SIGNED     |S32      |QASYMM8_SIGNED |
+     * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32      |QASYMM8_SIGNED |
+     * |QASYMM8_SIGNED |QSYMM8             |S32      |QASYMM8_SIGNED |
+     * |QASYMM8_SIGNED |QASYMM8_SIGNED     |S32      |S32            |
+     * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32      |S32            |
+     * |QASYMM8_SIGNED |QSYMM8             |S32      |S32            |
+     *
+     * @note GEMMLowp:  low precision GEMM kernel. [A * B + C]
+     *  This kernel performs the following computations:
+     *
+     *  -# Convert a values from 8-bit quantized to int32 and add a_offset to each of them.
+     *  -# Convert b values from 8-bit quantized to int32 and add b_offset to each of them.
+     *  -# Compute the matrix product of the resulting a * b in int32.
+     *  -# Quantize to uint8 if gemm_info.gemmlowp_output_stage != NONE
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  a               First input tensor  (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED.
+     * @param[in]  b               Second input tensor (Matrix B). Data type supported: same as @p a
+     * @param[in]  c               Third input tensor  (Matrix C). It can be a nullptr. Data type supported: S32
+     * @param[out] output          Output tensor. Data type supported: S32 or QASYMM8/QASYMM8_SIGNED if gemm_info.gemmlowp_output_stage != NONE
+     * @param[in]  gemm_info       (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
+     *                       if the reshape of matrix B should be executed only for the first run
+     */
+    void configure(const CLCompileContext &compile_context,
+                   ITensorInfo            *a,
+                   ITensorInfo            *b,
+                   ITensorInfo            *c,
+                   ITensorInfo            *output,
+                   const GEMMInfo         &gemm_info = GEMMInfo());
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClGemmLowpMatrixMultiplyCore::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *a,
+                           const ITensorInfo *b,
+                           const ITensorInfo *c,
+                           const ITensorInfo *output,
+                           const GEMMInfo    &gemm_info = GEMMInfo());
+
+    // Inherited methods overridden:
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &constants) override;
+    experimental::MemoryRequirements workspace() const override;
+
+private:
+    enum AuxTensorIdx
+    {
+        ResultS32 = 0,
+        RhsQAsymm8,
+        RhsReshape,
+        VecSumCol,
+        VecSumRow,
+        Multipliers,
+        Shifts,
+        Count
+    };
+
+private:
+    // Kernels used
+    std::unique_ptr<kernels::ClCastKernel>                                      _weights_to_qasymm8;
+    std::unique_ptr<kernels::ClGemmLowpMatrixMultiplyNativeKernel>              _mm_native_kernel;
+    std::unique_ptr<kernels::ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel>     _mm_reshaped_only_rhs_kernel;
+    std::unique_ptr<kernels::ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel> _mm_reshaped_only_rhs_mmul_kernel;
+    std::unique_ptr<kernels::ClGemmReshapeRhsMatrixKernel>                      _mtx_b_reshape_kernel;
+    std::unique_ptr<kernels::ClGemmLowpMatrixAReductionKernel>                  _mtx_a_reduction_kernel;
+    std::unique_ptr<kernels::ClGemmLowpMatrixBReductionKernel>                  _mtx_b_reduction_kernel;
+    std::unique_ptr<kernels::ClGemmLowpOffsetContributionKernel>                _offset_contribution_kernel;
+    std::unique_ptr<kernels::ClGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel;
+
+    // Temporary tensors
+    TensorInfo _qasymm8_weights{};
+    TensorInfo _vector_sum_col{};
+    TensorInfo _vector_sum_row{};
+    TensorInfo _tmp_b{};
+    TensorInfo _mm_result_s32{};
+    TensorInfo _gemm_output_stage_multipliers{};
+    TensorInfo _gemm_output_stage_shifts{};
+
+    int32_t          _a_offset{0};
+    int32_t          _b_offset{0};
+    bool             _reshape_b_only_on_first_run{false};
+    bool             _run_output_stage{false};
+    bool             _convert_to_qasymm8{false};
+    bool             _run_offset_contribution{false};
+    bool             _is_prepared{false};
+    GEMMInfo         _gemm_info{};
+    CLGEMMKernelType _gemm_kernel_type{};
+
+    experimental::MemoryRequirements _aux_mem{};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_CORE_H */
diff --git a/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp b/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp
new file mode 100644
index 0000000000..e3363e3685
--- /dev/null
+++ b/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClGemmLowpOutputStage.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClGemmLowpOutputStage::configure(const CLCompileContext        &compile_context,
+                                      const ITensorInfo             *src,
+                                      const ITensorInfo             *bias,
+                                      ITensorInfo                   *dst,
+                                      const GEMMLowpOutputStageInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_LOG_PARAMS(src, bias, dst, info);
+
+    switch (info.type)
+    {
+        case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
+        {
+            auto k = std::make_unique<opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel>();
+            k->configure(compile_context, src, bias, dst, &info);
+            _kernel = std::move(k);
+            break;
+        }
+        case GEMMLowpOutputStageType::QUANTIZE_DOWN:
+        {
+            auto k = std::make_unique<opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel>();
+            k->configure(compile_context, src, bias, dst, &info);
+            _kernel = std::move(k);
+            break;
+        }
+        case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT:
+        {
+            auto k = std::make_unique<opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel>();
+            k->configure(compile_context, src, bias, dst, &info);
+            _kernel = std::move(k);
+            break;
+        }
+        default:
+            ARM_COMPUTE_ERROR("Unsupported GEMMLowpOutputStage type.");
+    }
+}
+
+Status ClGemmLowpOutputStage::validate(const ITensorInfo             *src,
+                                       const ITensorInfo             *bias,
+                                       const ITensorInfo             *dst,
+                                       const GEMMLowpOutputStageInfo &info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::QSYMM16);
+
+    switch (info.type)
+    {
+        case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
+            return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(src, bias, dst, &info);
+        case GEMMLowpOutputStageType::QUANTIZE_DOWN:
+            return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel::validate(src, bias, dst, &info);
+        case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT:
+            return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(src, bias, dst, &info);
+        default:
+            return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported GEMMLowpOutputStage type.");
+    }
+}
+
+void ClGemmLowpOutputStage::run(ITensorPack &tensors)
+{
+    const ITensor *src  = tensors.get_const_tensor(ACL_SRC);
+    const ITensor *bias = tensors.get_const_tensor(ACL_BIAS);
+    ITensor       *dst  = tensors.get_tensor(ACL_DST);
+
+    ITensorPack pack{{ACL_SRC, src}, {ACL_BIAS, bias}, {ACL_DST, dst}};
+    CLScheduler::get().enqueue_op(*_kernel, pack, true);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClGemmLowpOutputStage.h b/src/gpu/cl/operators/ClGemmLowpOutputStage.h
new file mode 100644
index 0000000000..6357e0200b
--- /dev/null
+++ b/src/gpu/cl/operators/ClGemmLowpOutputStage.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_OUTPUT_STAGE_H
+#define ARM_COMPUTE_CL_GEMMLOWP_OUTPUT_STAGE_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+/** This file contains all available output stages for GEMMLowp on OpenCL.
+ *
+ *  In gemmlowp, the "output stage" is the process that takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyCore),
+ *  and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ *
+ *  More information about the GEMMLowp output stage can be found at https://github.com/google/gemmlowp/blob/master/doc/output.md
+ */
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to execute GEMMLowpQuantizeDown kernels on CL.
+ *
+ *  This function calls the following CL kernels:
+ *
+ * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel
+ * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel
+ * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel
+*/
+class ClGemmLowpOutputStage : public IClOperator
+{
+public:
+    /** Constructor */
+    ClGemmLowpOutputStage() = default;
+    /** Initialise the kernel's inputs, output
+     *
+     * Valid data layouts:
+     * - All
+     *
+     * Valid data type configurations:
+     * |src0           |src1          |dst           |
+     * |:--------------|:-------------|:-------------|
+     * |S32            |S32           |QASYMM8       |
+     * |S32            |S32           |QASYMM8_SIGNED|
+     * |S32            |S32           |QSYMM16       |
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor. Data type supported: S32
+     * @param[in]  bias            Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                             Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src.
+     * @param[out] dst             Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  info            GEMMLowp output stage metadata.
+     */
+    void configure(const CLCompileContext        &compile_context,
+                   const ITensorInfo             *src,
+                   const ITensorInfo             *bias,
+                   ITensorInfo                   *dst,
+                   const GEMMLowpOutputStageInfo &info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClGemmLowpOutputStage::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo             *src,
+                           const ITensorInfo             *bias,
+                           const ITensorInfo             *dst,
+                           const GEMMLowpOutputStageInfo &info);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_OUTPUT_STAGE_H */
diff --git a/src/gpu/cl/operators/ClIndirectConv2d.cpp b/src/gpu/cl/operators/ClIndirectConv2d.cpp
new file mode 100644
index 0000000000..777fc9e5e1
--- /dev/null
+++ b/src/gpu/cl/operators/ClIndirectConv2d.cpp
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClIndirectConv2d.h"
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h"
+#include "src/gpu/cl/kernels/ClIndirectConv2dKernel.h"
+#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
+#include "src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h"
+#include "src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h"
+
+using namespace arm_compute::cl_indirect_conv;
+
+namespace arm_compute
+{
+namespace opencl
+{
+using namespace arm_compute::experimental;
+
+namespace
+{
+DirectConvComputeKernelInfo
+config_indirect_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info)
+{
+    // Get GPU target
+    GPUTarget gpu_target = CLScheduler::get().target();
+
+    std::unique_ptr<IClIndirectConvKernelConfig> t = ClIndirectConvKernelConfigurationFactory::create(gpu_target);
+
+    return t->configure(src, weights, conv_info);
+}
+
+} // namespace
+
+void ClIndirectConv2d::configure(const CLCompileContext    &compile_context,
+                                 ITensorInfo               *src,
+                                 ITensorInfo               *weights,
+                                 ITensorInfo               *biases,
+                                 ITensorInfo               *dst,
+                                 const PadStrideInfo       &conv_info,
+                                 const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+    ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info);
+
+    // Reuse the direct convolution descriptor
+    const DirectConvComputeKernelInfo desc = config_indirect_convolution_nhwc(src, weights, conv_info);
+
+    // Configure indirect convolution kernels
+    auto k0 = std::make_unique<kernels::ClIndirectConv2dAddressPrecalculationKernel>();
+    auto k1 = std::make_unique<kernels::ClIndirectConv2dKernel>();
+
+    k0->set_target(CLScheduler::get().target());
+    k1->set_target(CLScheduler::get().target());
+
+    k0->configure(compile_context, src, weights, &_indirect_buffer, conv_info, desc);
+    k1->configure(compile_context, src, weights, biases, &_indirect_buffer, dst, conv_info, act_info, desc);
+
+    _addr_precalculation_kernel = std::move(k0);
+    _indirect_conv_kernel       = std::move(k1);
+    _is_prepared                = false;
+
+    // Tune kernels
+    CLScheduler::get().tune_kernel_static(*_indirect_conv_kernel);
+
+    // Request memory for the indirect buffer
+    _aux_mem[IndirectBuffer] =
+        MemoryInfo(offset_int_vec(IndirectBuffer), MemoryLifetime::Persistent, _indirect_buffer.total_size());
+}
+
+Status ClIndirectConv2d::validate(const ITensorInfo         *src,
+                                  const ITensorInfo         *weights,
+                                  const ITensorInfo         *biases,
+                                  const ITensorInfo         *dst,
+                                  const PadStrideInfo       &conv_info,
+                                  const ActivationLayerInfo &act_info)
+{
+    // Initialize the direct convolution descriptor
+    const DirectConvComputeKernelInfo desc = config_indirect_convolution_nhwc(src, weights, conv_info);
+
+    TensorShape ind_buffer_shape = misc::shape_calculator::compute_indirect_buffer_shape(
+        src->tensor_shape(), src->data_layout(), weights->tensor_shape(), conv_info, desc);
+
+    TensorInfo indirect_buffer(ind_buffer_shape, 1, DataType::S32);
+
+    ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dAddressPrecalculationKernel::validate(
+        src, weights, &indirect_buffer, conv_info, desc));
+    ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dKernel::validate(src, weights, biases, &indirect_buffer, dst,
+                                                                          conv_info, act_info, desc));
+
+    return Status{};
+}
+
+void ClIndirectConv2d::run(ITensorPack &tensors)
+{
+    CLAuxTensorHandler indirect_buffer(offset_int_vec(IndirectBuffer), _indirect_buffer, tensors, true);
+
+    prepare(tensors);
+
+    ITensorPack indirect_conv2d_pack(tensors);
+    indirect_conv2d_pack.add_const_tensor(ACL_SRC_3, indirect_buffer.get());
+
+    // Run indirect convolution
+    CLScheduler::get().enqueue_op(*_indirect_conv_kernel, indirect_conv2d_pack, true);
+}
+
+void ClIndirectConv2d::prepare(ITensorPack &constants)
+{
+    if (!_is_prepared)
+    {
+        ICLTensor *indirect_buffer_aux =
+            utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(IndirectBuffer)));
+        ARM_COMPUTE_ERROR_ON(indirect_buffer_aux == nullptr);
+
+        ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Preparing indirect buffer");
+
+        CLAuxTensorHandler indirect_buffer(_indirect_buffer, *indirect_buffer_aux);
+        ARM_COMPUTE_ERROR_ON(indirect_buffer.get()->cl_buffer().get() == nullptr);
+
+        ITensorPack indirect_buffer_pack{{ACL_DST, indirect_buffer.get()}};
+        CLScheduler::get().enqueue_op(*_addr_precalculation_kernel, indirect_buffer_pack, true);
+
+        _is_prepared = true;
+    }
+}
+
+experimental::MemoryRequirements ClIndirectConv2d::workspace() const
+{
+    return _aux_mem;
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClIndirectConv2d.h b/src/gpu/cl/operators/ClIndirectConv2d.h
new file mode 100644
index 0000000000..29e796efd9
--- /dev/null
+++ b/src/gpu/cl/operators/ClIndirectConv2d.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_INDIRECT_CONV2D_H
+#define ARM_COMPUTE_CL_INDIRECT_CONV2D_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTypes.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/IClOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+// Forward declaration
+struct DirectConvComputeKernelInfo;
+
+namespace opencl
+{
+/** Basic function to execute indirect convolution on OpenCL. This function calls the following OpenCL kernels:
+ *
+ *  -# @ref kernels::ClIndirectConv2dAddressPrecalculationKernel
+ *  -# @ref kernels::ClIndirectConv2dKernel
+ */
+class ClIndirectConv2d : public IClOperator
+{
+public:
+    ClIndirectConv2d() = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * Valid data layouts:
+     * - NHWC
+     *
+     * Valid data type configurations:
+     * |src0         |src1        |src2      |dst            |
+     * |:------------|:-----------|:---------|:--------------|
+     * |F32          |F32         |F32       |F32            |
+     * |F16          |F16         |F16       |F16            |
+     *
+     * @note All tensors must have the same data type.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor. 3 lower dimensions represent a single src,
+     *                             while every optional dimension from 4 and above represent a batch of sources.
+     *                             Data types supported: F16/F32.
+     * @param[in]  weights         Weights tensor. Weights are 4D tensor with dimensions. Data type supported:Same as @p src.
+     * @param[in]  biases          Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+     *                             Data type supported: Should match @p src data type.
+     * @param[out] dst             Destination tensor. 3 lower dimensions represent a single dst, while the rest represent batch of destinations.
+     *                             Data types supported: Same as @p src.
+     * @param[in]  conv_info       Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
+     *
+     */
+    void configure(const CLCompileContext    &compile_context,
+                   ITensorInfo               *src,
+                   ITensorInfo               *weights,
+                   ITensorInfo               *biases,
+                   ITensorInfo               *dst,
+                   const PadStrideInfo       &conv_info,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClIndirectConv2d::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo         *src,
+                           const ITensorInfo         *weights,
+                           const ITensorInfo         *biases,
+                           const ITensorInfo         *dst,
+                           const PadStrideInfo       &conv_info,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &constants) override;
+    experimental::MemoryRequirements workspace() const override;
+
+private:
+    enum AuxTensorIdx
+    {
+        IndirectBuffer = 0,
+        Count
+    };
+
+    std::unique_ptr<IClKernel>       _indirect_conv_kernel{nullptr};
+    std::unique_ptr<IClKernel>       _addr_precalculation_kernel{nullptr};
+    TensorInfo                       _indirect_buffer{};
+    bool                             _is_prepared{false};
+    experimental::MemoryRequirements _aux_mem{Count};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_INDIRECT_CONV2D_H */
diff --git a/src/gpu/cl/operators/ClLogicalNot.cpp b/src/gpu/cl/operators/ClLogicalNot.cpp
new file mode 100644
index 0000000000..d8d4186d00
--- /dev/null
+++ b/src/gpu/cl/operators/ClLogicalNot.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClLogicalNot.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClLogicalNot::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+    ARM_COMPUTE_LOG_PARAMS(src, dst);
+    auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
+    k->configure(compile_context, src, dst, ElementWiseUnary::LOGICAL_NOT);
+    _kernel = std::move(k);
+}
+
+Status ClLogicalNot::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::LOGICAL_NOT);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClLogicalNot.h b/src/gpu/cl/operators/ClLogicalNot.h
new file mode 100644
index 0000000000..31d4a99be6
--- /dev/null
+++ b/src/gpu/cl/operators/ClLogicalNot.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_LOGICAL_NOT_H
+#define ARM_COMPUTE_CL_LOGICAL_NOT_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClElementWiseUnaryKernel for NOT operation */
+class ClLogicalNot : public IClOperator
+{
+public:
+    /** Configure operator for a given list of arguments
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data types supported: U8.
+     * @param[out] dst             Destination tensor info. Data types supported: same as @p src.
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClLogicalNot::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_LOGICAL_NOT_H */
diff --git a/src/gpu/cl/operators/ClMatMul.cpp b/src/gpu/cl/operators/ClMatMul.cpp
new file mode 100644
index 0000000000..28a2aa2540
--- /dev/null
+++ b/src/gpu/cl/operators/ClMatMul.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClMatMul.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h"
+#include "src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h"
+#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
+#include "src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h"
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h"
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h"
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h"
+#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h"
+
+using namespace arm_compute::cl_matmul;
+
+namespace arm_compute
+{
+namespace opencl
+{
+using namespace arm_compute::opencl::kernels;
+
+ClMatMul::ClMatMul()
+{
+}
+
+Status ClMatMul::validate(const ITensorInfo         *lhs,
+                          const ITensorInfo         *rhs,
+                          const ITensorInfo         *dst,
+                          const MatMulInfo          &matmul_info,
+                          const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
+
+    const GPUTarget gpu_target = CLScheduler::get().target();
+
+    std::unique_ptr<IClMatMulNativeKernelConfig> t = ClMatMulNativeKernelConfigurationFactory::create(gpu_target);
+
+    const MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info);
+
+    const auto             kernel_selector = ClMatMulNativeKernelVariantFactory::create(gpu_target);
+    const MatMulKernelType kernel_type     = kernel_selector->select_kernel(lhs, rhs, matmul_info, act_info);
+
+    switch (kernel_type)
+    {
+        case MatMulKernelType::NATIVE_FP:
+            return ClMatMulNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+        case MatMulKernelType::NATIVE_MMUL_FP:
+            return ClMatMulNativeMMULKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info);
+        case MatMulKernelType::NATIVE_QUANTIZED:
+            return ClMatMulLowpNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+        case MatMulKernelType::NATIVE_MMUL_QUANTIZED:
+            return ClMatMulLowpNativeMMULKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+        default:
+            ARM_COMPUTE_ERROR("Unsupported MatMul Kernel!");
+    }
+}
+
+void ClMatMul::configure(const CLCompileContext    &compile_context,
+                         ITensorInfo               *lhs,
+                         ITensorInfo               *rhs,
+                         ITensorInfo               *dst,
+                         const MatMulInfo          &matmul_info,
+                         const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+    ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_info);
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, dst, matmul_info));
+
+    const GPUTarget        gpu_target    = CLScheduler::get().target();
+    const auto             kernel_config = ClMatMulNativeKernelConfigurationFactory::create(gpu_target);
+    const MatMulKernelInfo kernel_info   = kernel_config->configure(lhs, rhs, matmul_info);
+
+    const auto             kernel_selector = ClMatMulNativeKernelVariantFactory::create(gpu_target);
+    const MatMulKernelType kernel_type     = kernel_selector->select_kernel(lhs, rhs, matmul_info, act_info);
+
+    switch (kernel_type)
+    {
+        case MatMulKernelType::NATIVE_FP:
+        {
+            auto kernel = std::make_unique<ClMatMulNativeKernel>();
+            kernel->set_target(gpu_target);
+
+            kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+            _matmul_kernel = std::move(kernel);
+        }
+        break;
+        case MatMulKernelType::NATIVE_MMUL_FP:
+        {
+            auto kernel = std::make_unique<ClMatMulNativeMMULKernel>();
+            kernel->set_target(gpu_target);
+
+            kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info);
+            _matmul_kernel = std::move(kernel);
+        }
+        break;
+        case MatMulKernelType::NATIVE_QUANTIZED:
+        {
+            auto kernel = std::make_unique<ClMatMulLowpNativeKernel>();
+            kernel->set_target(gpu_target);
+
+            kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+            _matmul_kernel = std::move(kernel);
+        }
+        break;
+        case MatMulKernelType::NATIVE_MMUL_QUANTIZED:
+        {
+            auto kernel = std::make_unique<ClMatMulLowpNativeMMULKernel>();
+            kernel->set_target(gpu_target);
+
+            kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+            _matmul_kernel = std::move(kernel);
+        }
+        break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported MatMul Kernel!");
+    }
+}
+
+void ClMatMul::run(ITensorPack &tensors)
+{
+    CLScheduler::get().enqueue_op(*_matmul_kernel, tensors, /* flush */ true);
+}
+
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClMatMul.h b/src/gpu/cl/operators/ClMatMul.h
new file mode 100644
index 0000000000..1733def21c
--- /dev/null
+++ b/src/gpu/cl/operators/ClMatMul.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_OPERATORS_CLMATMUL_H
+#define ACL_SRC_GPU_CL_OPERATORS_CLMATMUL_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/function_info/MatMulInfo.h"
+
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/IClOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic operator to execute BatchMatMul on OpenCL. This operator calls the following OpenCL kernels:
+ *
+ *  -# @ref kernels::ClMatMulNativeKernel
+ */
+class ClMatMul : public IClOperator
+{
+public:
+    /** Constructor */
+    ClMatMul();
+    /** Default destructor */
+    ~ClMatMul() = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * Valid data layouts:
+     * - All
+     *
+     * Valid data type configurations:
+     * |lhs            |rhs            |dst            |
+     * |:--------------|:--------------|:--------------|
+     * |F32            |F32            |F32            |
+     * |F16            |F16            |F16            |
+     * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
+     * |QASYMM8        |QASYMM8        |QASYMM8        |
+     *
+     * @note BatchMatMul: Batched Matrix Multiply - [A * B], Multiplies all slices (slice is an element of a batch) of Tensors A and B
+     *                    and stores the result in the dst tensor of the same batch size.
+     *                    Batch here is number of slices from A and B multiplied at a time, do not confuse with the batch dimension 'N' of NHWC/NCHW
+     *                    For NHWC for example: the batch is the higher dimensions H * N, and in general it is H * all higher dimensions.
+     * @note All tensors must have the same data type.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  lhs             Left-hand side tensor info. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8.
+     * @param[in]  rhs             Right-hand side tensor info. Data types supported: same as @p lhs.
+     * @param[out] dst             Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs.
+     * @param[in]  matmul_info     Contains MatMul operation information described in @ref MatMulInfo.
+     * @param[in]  act_info        Class containing information about fused activation function.
+     */
+    void configure(const CLCompileContext    &compile_context,
+                   ITensorInfo               *lhs,
+                   ITensorInfo               *rhs,
+                   ITensorInfo               *dst,
+                   const MatMulInfo          &matmul_info,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClMatMul::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo         *lhs,
+                           const ITensorInfo         *rhs,
+                           const ITensorInfo         *dst,
+                           const MatMulInfo          &matmul_info,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+
+private:
+    std::unique_ptr<opencl::IClKernel> _matmul_kernel{nullptr};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_OPERATORS_CLMATMUL_H
diff --git a/src/gpu/cl/operators/ClMul.cpp b/src/gpu/cl/operators/ClMul.cpp
new file mode 100644
index 0000000000..10cf8a6a38
--- /dev/null
+++ b/src/gpu/cl/operators/ClMul.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClMul.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClMulKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClMul::configure(const CLCompileContext    &compile_context,
+                      ITensorInfo               *src1,
+                      ITensorInfo               *src2,
+                      ITensorInfo               *dst,
+                      float                      scale,
+                      ConvertPolicy              overflow_policy,
+                      RoundingPolicy             rounding_policy,
+                      const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info);
+    auto k = std::make_unique<kernels::ClMulKernel>();
+    k->configure(compile_context, src1, src2, dst, scale, overflow_policy, rounding_policy, act_info);
+    _kernel = std::move(k);
+}
+
+Status ClMul::validate(const ITensorInfo         *src1,
+                       const ITensorInfo         *src2,
+                       const ITensorInfo         *dst,
+                       float                      scale,
+                       ConvertPolicy              overflow_policy,
+                       RoundingPolicy             rounding_policy,
+                       const ActivationLayerInfo &act_info)
+{
+    return kernels::ClMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info);
+}
+
+void ClComplexMul::configure(const CLCompileContext    &compile_context,
+                             ITensorInfo               *src1,
+                             ITensorInfo               *src2,
+                             ITensorInfo               *dst,
+                             const ActivationLayerInfo &act_info)
+{
+    auto k = std::make_unique<kernels::ClComplexMulKernel>();
+    k->configure(compile_context, src1, src2, dst, act_info);
+    _kernel = std::move(k);
+}
+
+Status ClComplexMul::validate(const ITensorInfo         *src1,
+                              const ITensorInfo         *src2,
+                              const ITensorInfo         *dst,
+                              const ActivationLayerInfo &act_info)
+{
+    return kernels::ClComplexMulKernel::validate(src1, src2, dst, act_info);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClMul.h b/src/gpu/cl/operators/ClMul.h
new file mode 100644
index 0000000000..1cf4d68d4c
--- /dev/null
+++ b/src/gpu/cl/operators/ClMul.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_MUL_H
+#define ARM_COMPUTE_CL_MUL_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref opencl::kernels::ClMulKernel */
+class ClMul : public IClOperator
+{
+public:
+    /** Initialise the kernel's sources, dst and convertion policy.
+     *
+     * Valid configurations (src1,src2) -> Output :
+     *
+     *   - (U8,U8)                         -> U8
+     *   - (U8,U8)                         -> S16
+     *   - (U8,S16)                        -> S16
+     *   - (S16,U8)                        -> S16
+     *   - (S16,S16)                       -> S16
+     *   - (F16,F16)                       -> F16
+     *   - (F32,F32)                       -> F32
+     *   - (QASYMM8,QASYMM8)               -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16)               -> QSYMM16
+     *   - (QSYMM16,QSYMM16)               -> S32
+     *
+     * @param[in]      compile_context The compile context to be used.
+     * @param[in, out] src1            An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     *                                 The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[in, out] src2            An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     *                                 The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[out]     dst             The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+     * @param[in]      scale           Scale to apply after multiplication.
+     *                                 Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+     * @param[in]      overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+     * @param[in]      rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
+     * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const CLCompileContext    &compile_context,
+                   ITensorInfo               *src1,
+                   ITensorInfo               *src2,
+                   ITensorInfo               *dst,
+                   float                      scale,
+                   ConvertPolicy              overflow_policy,
+                   RoundingPolicy             rounding_policy,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClMul::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo         *src1,
+                           const ITensorInfo         *src2,
+                           const ITensorInfo         *dst,
+                           float                      scale,
+                           ConvertPolicy              overflow_policy,
+                           RoundingPolicy             rounding_policy,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+
+/** Basic function to run @ref opencl::kernels::ClComplexMulKernel */
+class ClComplexMul : public IClOperator
+{
+public:
+    /** Initialise the kernel's sources, dst.
+     *
+     * @param[in]      compile_context The compile context to be used.
+     * @param[in, out] src1            An src tensor info. Data types supported: F16/F32. Number of channels supported: 2.
+     *                                 The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[in, out] src2            An src tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
+     *                                 The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[out]     dst             The dst tensor info, Data types supported: same as @p src1. Number of channels supported: same as @p src1.
+     * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const CLCompileContext    &compile_context,
+                   ITensorInfo               *src1,
+                   ITensorInfo               *src2,
+                   ITensorInfo               *dst,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClComplexMul::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo         *src1,
+                           const ITensorInfo         *src2,
+                           const ITensorInfo         *dst,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_MUL_H */
diff --git a/src/gpu/cl/operators/ClPRelu.cpp b/src/gpu/cl/operators/ClPRelu.cpp
new file mode 100644
index 0000000000..f3efd00bba
--- /dev/null
+++ b/src/gpu/cl/operators/ClPRelu.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClPRelu.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+using KernelType = kernels::ClArithmeticKernel;
+void ClPRelu::configure(const CLCompileContext &compile_context,
+                        ITensorInfo            *input,
+                        ITensorInfo            *alpha,
+                        ITensorInfo            *output)
+{
+    ARM_COMPUTE_LOG_PARAMS(input, alpha, output);
+    auto k = std::make_unique<KernelType>();
+    k->configure(compile_context, ArithmeticOperation::PRELU, input, alpha, (output == nullptr ? input : output));
+    _kernel = std::move(k);
+}
+
+Status ClPRelu::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
+{
+    return KernelType::validate(ArithmeticOperation::PRELU, input, alpha, (output == nullptr ? input : output));
+}
+
+void ClPRelu::run(ITensorPack &tensors)
+{
+    // Output tensor can be given as nullptr for in-place computation.
+    // In this case, get the input tensor and use it as the output tensor.
+    if (tensors.get_tensor(TensorType::ACL_DST) == nullptr)
+    {
+        auto src_tensor = const_cast<ITensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+        ARM_COMPUTE_ERROR_ON_MSG(src_tensor == nullptr, "invalid source tensor is given for in-place computation");
+        tensors.add_tensor(TensorType::ACL_DST, src_tensor);
+    }
+    IClOperator::run(tensors);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClPRelu.h b/src/gpu/cl/operators/ClPRelu.h
new file mode 100644
index 0000000000..45ce858fb0
--- /dev/null
+++ b/src/gpu/cl/operators/ClPRelu.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_PRELU_H
+#define ARM_COMPUTE_CL_PRELU_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic operator to run @ref arm_compute::opencl::kernels::ClArithmeticKernel for PRELU
+ *
+ * @note The operator implements an activation layer with the PRELU activation function.
+ */
+class ClPRelu : public IClOperator
+{
+public:
+    /** Set the input and output tensor.
+     *
+     * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]  alpha           PRelu layer parameters. Data types supported: same of @p input.
+     * @param[out] output          Destination tensor. Data type supported: same as @p input
+     */
+    void
+    configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClPRelu::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_PRELU_H */
diff --git a/src/gpu/cl/operators/ClPermute.cpp b/src/gpu/cl/operators/ClPermute.cpp
new file mode 100644
index 0000000000..3851e22b6a
--- /dev/null
+++ b/src/gpu/cl/operators/ClPermute.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClPermute.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClPermuteKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClPermute::configure(const ClCompileContext  &compile_context,
+                          const ITensorInfo       *src,
+                          ITensorInfo             *dst,
+                          const PermutationVector &perm)
+{
+    ARM_COMPUTE_LOG_PARAMS(src, dst, perm);
+    auto k = std::make_unique<kernels::ClPermuteKernel>();
+    k->configure(compile_context, src, dst, perm);
+    _kernel = std::move(k);
+}
+
+Status ClPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm)
+{
+    return kernels::ClPermuteKernel::validate(src, dst, perm);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClPermute.h b/src/gpu/cl/operators/ClPermute.h
new file mode 100644
index 0000000000..6349358a18
--- /dev/null
+++ b/src/gpu/cl/operators/ClPermute.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_PERMUTE_H
+#define ARM_COMPUTE_CL_PERMUTE_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClPermuteKernel */
+class ClPermute : public IClOperator
+{
+public:
+    /** Initialise the kernel's inputs and outputs and permute vector
+     *
+     * @note Arbitrary permutation vectors are supported with rank not greater than 4
+     *
+     * @param[in] compile_context The compile context to be used.
+     * @param[in] src             The src tensor info. Data types supported: All.
+     * @param[in] dst             The dst tensor info. Data types supported: Same as @p src
+     * @param[in] perm            Permutation vector
+     */
+    void configure(const CLCompileContext  &compile_context,
+                   const ITensorInfo       *src,
+                   ITensorInfo             *dst,
+                   const PermutationVector &perm);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClPermute::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_PERMUTE_H */
diff --git a/src/gpu/cl/operators/ClPool2d.cpp b/src/gpu/cl/operators/ClPool2d.cpp
new file mode 100644
index 0000000000..e4507dc1a1
--- /dev/null
+++ b/src/gpu/cl/operators/ClPool2d.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClPool2d.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClPool2dKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClPool2d::configure(const ClCompileContext &compile_context,
+                         ITensorInfo            *src,
+                         ITensorInfo            *dst,
+                         const PoolingLayerInfo &info,
+                         ITensorInfo            *indices)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+    ARM_COMPUTE_LOG_PARAMS(src, dst, info, indices);
+
+    // Configure pooling kernel
+    auto k = std::make_unique<kernels::ClPool2dKernel>();
+    k->set_target(CLScheduler::get().target());
+    k->configure(compile_context, src, dst, info, indices);
+    _kernel = std::move(k);
+
+    // Tune kernels
+    CLScheduler::get().tune_kernel_static(*_kernel);
+}
+
+Status ClPool2d::validate(const ITensorInfo      *src,
+                          const ITensorInfo      *dst,
+                          const PoolingLayerInfo &info,
+                          const ITensorInfo      *indices)
+{
+    return kernels::ClPool2dKernel::validate(src, dst, info, indices);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClPool2d.h b/src/gpu/cl/operators/ClPool2d.h
new file mode 100644
index 0000000000..9c2fd1c3f2
--- /dev/null
+++ b/src/gpu/cl/operators/ClPool2d.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_POOL2D_H
+#define ARM_COMPUTE_CL_POOL2D_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following OpenCL kernels:
+ *
+ * -# @ref opencl::ClPool2d
+ */
+class ClPool2d : public IClOperator
+{
+public:
+    /** Constructor */
+    ClPool2d() = default;
+    /** Configure operator for a given list of arguments
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[out] dst             Destination tensor info. Data type supported: same as @p src
+     * @param[in]  info            Pooling layer parameters.
+     * @param[out] indices         (optional) The indices info of the maximal values. Data type supported: U32.
+     */
+    void configure(const ClCompileContext &compile_context,
+                   ITensorInfo            *src,
+                   ITensorInfo            *dst,
+                   const PoolingLayerInfo &info,
+                   ITensorInfo            *indices = nullptr);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClPool2d::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo      *src,
+                           const ITensorInfo      *dst,
+                           const PoolingLayerInfo &info,
+                           const ITensorInfo      *indices = nullptr);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_POOL2D_H */
diff --git a/src/gpu/cl/operators/ClPool3d.cpp b/src/gpu/cl/operators/ClPool3d.cpp
new file mode 100644
index 0000000000..d230413659
--- /dev/null
+++ b/src/gpu/cl/operators/ClPool3d.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClPool3d.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClPool3dKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClPool3d::configure(const ClCompileContext   &compile_context,
+                         const ITensorInfo        *src,
+                         ITensorInfo              *dst,
+                         const Pooling3dLayerInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+    ARM_COMPUTE_LOG_PARAMS(src, dst, info);
+
+    // Configure pooling kernel
+    auto k = std::make_unique<kernels::ClPool3dKernel>();
+    k->set_target(CLScheduler::get().target());
+    k->configure(compile_context, src, dst, info);
+    _kernel = std::move(k);
+
+    // Tune kernels
+    CLScheduler::get().tune_kernel_static(*_kernel);
+}
+
+Status ClPool3d::validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &info)
+{
+    return kernels::ClPool3dKernel::validate(src, dst, info);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClPool3d.h b/src/gpu/cl/operators/ClPool3d.h
new file mode 100644
index 0000000000..9fd78bfd69
--- /dev/null
+++ b/src/gpu/cl/operators/ClPool3d.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_POOL3D_H
+#define ARM_COMPUTE_CL_POOL3D_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following OpenCL kernels:
+ *
+ * -# @ref opencl::ClPool3d
+ */
+class ClPool3d : public IClOperator
+{
+public:
+    /** Constructor */
+    ClPool3d() = default;
+    /** Configure operator for a given list of arguments
+     *
+     * @note Asymmetric padding is not supported when dimension rounding type == CEIL.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info.
+     * @param[out] dst             Destination tensor info.
+     * @param[in]  info            3d Pooling layer parameters.
+     */
+    void configure(const ClCompileContext   &compile_context,
+                   const ITensorInfo        *src,
+                   ITensorInfo              *dst,
+                   const Pooling3dLayerInfo &info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClPool3d::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &info);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_POOL3D_H */
diff --git a/src/gpu/cl/operators/ClQuantize.cpp b/src/gpu/cl/operators/ClQuantize.cpp
new file mode 100644
index 0000000000..8560b5553e
--- /dev/null
+++ b/src/gpu/cl/operators/ClQuantize.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClQuantize.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClQuantizeKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClQuantize::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst)
+{
+    ARM_COMPUTE_LOG_PARAMS(src, dst);
+    auto k = std::make_unique<kernels::ClQuantizeKernel>();
+    k->configure(compile_context, src, dst);
+    _kernel = std::move(k);
+}
+
+Status ClQuantize::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    return kernels::ClQuantizeKernel::validate(src, dst);
+}
+
+void ClQuantize::run(ITensorPack &tensors)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+    CLScheduler::get().enqueue_op(*_kernel.get(), tensors);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClQuantize.h b/src/gpu/cl/operators/ClQuantize.h
new file mode 100644
index 0000000000..3e50fcefb3
--- /dev/null
+++ b/src/gpu/cl/operators/ClQuantize.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_QUANTIZE_H
+#define ARM_COMPUTE_CL_QUANTIZE_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClQuantizeKernel that dequantizes an input tensor */
+class ClQuantize : public IClOperator
+{
+public:
+    /** Set the input and output tensors.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/32.
+     * @param[out] dst             Destination tensor with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
+     *
+     * @note Output auto initialization is not supported by this function
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClQuantize::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+    // Inherited method overridden
+    void run(ITensorPack &tensors) override;
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_QUANTIZE_H */
diff --git a/src/gpu/cl/operators/ClReshape.cpp b/src/gpu/cl/operators/ClReshape.cpp
new file mode 100644
index 0000000000..1dd5b760cb
--- /dev/null
+++ b/src/gpu/cl/operators/ClReshape.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClReshape.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClReshapeKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClReshape::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+    ARM_COMPUTE_LOG_PARAMS(src, dst);
+    auto k = std::make_unique<kernels::ClReshapeKernel>();
+    k->configure(compile_context, src, dst);
+    _kernel = std::move(k);
+}
+
+Status ClReshape::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    return kernels::ClReshapeKernel::validate(src, dst);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClReshape.h b/src/gpu/cl/operators/ClReshape.h
new file mode 100644
index 0000000000..fee69a1c24
--- /dev/null
+++ b/src/gpu/cl/operators/ClReshape.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_RESHAPE_H
+#define ARM_COMPUTE_CL_RESHAPE_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClReshapeKernel */
+class ClReshape : public IClOperator
+{
+public:
+    /** Initialise the kernel's inputs and outputs
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor info. Data type supported: All
+     * @param[out] output          Output info. Data type supported: Same as @p input
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClReshape::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_RESHAPE_H */
+\ No newline at end of file
diff --git a/src/gpu/cl/operators/ClScale.cpp b/src/gpu/cl/operators/ClScale.cpp
new file mode 100644
index 0000000000..184e2aa006
--- /dev/null
+++ b/src/gpu/cl/operators/ClScale.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClScale.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClScaleKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClScale::configure(const CLCompileContext &compile_context,
+                        ITensorInfo            *src,
+                        ITensorInfo            *dst,
+                        const ScaleKernelInfo  &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+    ARM_COMPUTE_LOG_PARAMS(src, dst, info);
+
+    // Configure Scale kernel
+    auto k = std::make_unique<kernels::ClScaleKernel>();
+    k->set_target(CLScheduler::get().target());
+    k->configure(compile_context, src, dst, info);
+    _kernel = std::move(k);
+
+    // Tune kernel
+    CLScheduler::get().tune_kernel_static(*_kernel);
+}
+
+Status ClScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info)
+{
+    return kernels::ClScaleKernel::validate(src, dst, info);
+}
+
+void ClScale::run(ITensorPack &tensors)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+    CLScheduler::get().enqueue_op(*_kernel.get(), tensors);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClScale.h b/src/gpu/cl/operators/ClScale.h
new file mode 100644
index 0000000000..1427bb4fdc
--- /dev/null
+++ b/src/gpu/cl/operators/ClScale.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_SCALE_H
+#define ARM_COMPUTE_CL_SCALE_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to simulate a scale layer. This function calls the following OpenCL kernels:
+ *
+ * -# @ref kernels::ClScaleKernel
+ */
+class ClScale : public IClOperator
+{
+public:
+    /** Constructor */
+    ClScale() = default;
+    /** Initialize the function's source, destination, interpolation type and border_mode.
+     *
+     * @param[in]     compile_context The compile context to be used.
+     * @param[in,out] src             Source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
+     * @param[out]    dst             Destination tensor info. Data types supported: Same as @p src
+     *                                All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in]     info            @ref ScaleKernelInfo descriptor to be used to configure
+     */
+    void
+    configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClScale::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info);
+
+    // Inherited method overridden
+    void run(ITensorPack &tensors) override;
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLSCALE_H */
diff --git a/src/gpu/cl/operators/ClScatter.cpp b/src/gpu/cl/operators/ClScatter.cpp
new file mode 100644
index 0000000000..a11ecd7e6a
--- /dev/null
+++ b/src/gpu/cl/operators/ClScatter.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClScatter.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClCopyKernel.h"
+#include "src/gpu/cl/kernels/ClFillKernel.h"
+#include "src/gpu/cl/kernels/ClScatterKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+using namespace arm_compute::opencl::kernels;
+
+ClScatter::ClScatter()
+{
+}
+
+Status ClScatter::validate(const ITensorInfo *src,
+                           const ITensorInfo *updates,
+                           const ITensorInfo *indices,
+                           const ITensorInfo *dst,
+                           const ScatterInfo &info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(updates, indices, dst);
+    if (src != nullptr)
+    {
+        // Check dst/src are same shape and datatype.
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst->tensor_shape());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, updates, dst);
+        ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClCopyKernel::validate(src, dst)); // Validate Copy kernel
+    }
+    if (src != dst)
+    {
+        ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClFillKernel::validate(dst, PixelValue(0.0f))); // Validate Fill kernel.
+    }
+
+    return kernels::ClScatterKernel::validate(updates, indices, dst, info);
+}
+
+void ClScatter::configure(const CLCompileContext &compile_context,
+                          const ITensorInfo      *src,
+                          const ITensorInfo      *updates,
+                          const ITensorInfo      *indices,
+                          ITensorInfo            *dst,
+                          const ScatterInfo      &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(updates, indices, dst);
+    ARM_COMPUTE_LOG_PARAMS(src, indices, dst, info);
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate(src, updates, indices, dst, info));
+    _fill_zero = info.zero_initialization;
+
+    // If necessary, create fill kernel to fill dst tensor.
+    if (_fill_zero)
+    {
+        auto f = std::make_unique<kernels::ClFillKernel>();
+        f->configure(compile_context, dst, PixelValue(0.0f));
+        _fill_kernel = std::move(f);
+    }
+    else if (src != dst) // Check whether copying is necessary
+    {
+        // Fill dst with src copy here.
+        auto j = std::make_unique<kernels::ClCopyKernel>();
+        j->configure(compile_context, src, dst);
+        _copy_kernel = std::move(j);
+        _run_copy    = true;
+    }
+
+    // Configure ClScatterKernel
+    auto k = std::make_unique<kernels::ClScatterKernel>();
+    k->set_target(CLScheduler::get().target());
+    k->configure(compile_context, updates, indices, dst, info);
+    _scatter_kernel = std::move(k);
+}
+
+void ClScatter::run(ITensorPack &tensors)
+{
+    // Get tensors.
+    auto src     = tensors.get_const_tensor(ACL_SRC_0);
+    auto updates = tensors.get_const_tensor(ACL_SRC_1);
+    auto indices = tensors.get_const_tensor(ACL_SRC_2);
+    auto dst     = tensors.get_tensor(ACL_DST);
+
+    if (_fill_zero)
+    {
+        // Fill destination tensor with 0 values if zero init.
+        ITensorPack fill_pack{{ACL_SRC, dst}};
+        CLScheduler::get().enqueue_op(*_fill_kernel, fill_pack, false);
+    }
+
+    if (_run_copy)
+    {
+        // copy src to dst before scatter op.
+        ITensorPack copy_pack{{ACL_SRC, src}, {ACL_DST, dst}};
+        CLScheduler::get().enqueue_op(*_copy_kernel, copy_pack, false);
+    }
+
+    ITensorPack scatter_pack{{ACL_SRC_0, updates}, {ACL_SRC_1, indices}, {ACL_DST, dst}};
+    CLScheduler::get().enqueue_op(*_scatter_kernel, scatter_pack, false);
+}
+
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClScatter.h b/src/gpu/cl/operators/ClScatter.h
new file mode 100644
index 0000000000..a1b32fed45
--- /dev/null
+++ b/src/gpu/cl/operators/ClScatter.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_GPU_CL_OPERATORS_CLSCATTER_H
+#define ACL_SRC_GPU_CL_OPERATORS_CLSCATTER_H
+
+#include "arm_compute/function_info/ScatterInfo.h"
+
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/IClOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+// Forward declaration
+class ClFillKernel;
+class ClScatterKernel;
+class ClCopyKernel;
+
+/** Basic operator to execute Scatter on OpenCL. This operator calls the following OpenCL kernels:
+ *
+ *  -# @ref kernels::ClScatterKernel
+ */
+class ClScatter : public IClOperator
+{
+public:
+    /** Constructor */
+    ClScatter();
+    /** Default destructor */
+    ~ClScatter() = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * Valid data layouts:
+     * - All
+     *
+     * @note indices must always be S32.
+     * @note Negative indices are treated as out of bounds.
+     * @note src, updates and dst tensors must be same datatype.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source input tensor info. Can be nullptr when using "Add" Scatter Function with zero initialization.
+     * @param[in]  updates         Tensor info for tensor storing update values to use for scatter function. Data types supported: same as @p src.
+     * @param[in]  indices         Tensor info for tensor storing indices to use for scatter function. Data types supported: S32 only.
+     * @param[out] dst             Output tensor to store the result of the Scatter Function. Data types supported: same as @p src and @p updates.
+     * @param[in]  Scatter_info    Contains Scatter operation information described in @ref ScatterInfo.
+     */
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo      *src,
+                   const ITensorInfo      *updates,
+                   const ITensorInfo      *indices,
+                   ITensorInfo            *dst,
+                   const ScatterInfo      &Scatter_info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClScatter::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src,
+                           const ITensorInfo *updates,
+                           const ITensorInfo *indices,
+                           const ITensorInfo *dst,
+                           const ScatterInfo &Scatter_info);
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+
+private:
+    std::unique_ptr<opencl::IClKernel> _scatter_kernel{nullptr};
+    std::unique_ptr<opencl::IClKernel> _fill_kernel{nullptr};
+    std::unique_ptr<opencl::IClKernel> _copy_kernel{nullptr};
+    bool                               _fill_zero{false};
+    bool                               _run_copy{false};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_OPERATORS_CLSCATTER_H
diff --git a/src/gpu/cl/operators/ClSoftmax.cpp b/src/gpu/cl/operators/ClSoftmax.cpp
new file mode 100644
index 0000000000..427f6b4f92
--- /dev/null
+++ b/src/gpu/cl/operators/ClSoftmax.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClSoftmax.h"
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/gpu/cl/kernels/ClSoftmaxKernel.h"
+#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
+
+using namespace arm_compute::experimental;
+
+namespace arm_compute
+{
+namespace opencl
+{
+
+ClSoftmax::ClSoftmax() : _aux_mem(InternalTensorIdx::COUNT)
+{
+}
+
+void ClSoftmax::configure(const CLCompileContext  &compile_context,
+                          const ITensorInfo       &src,
+                          ITensorInfo             &dst,
+                          const SoftmaxKernelInfo &info)
+{
+    ARM_COMPUTE_LOG_PARAMS(src, dst, info);
+
+    auto k = std::make_unique<kernels::ClSoftmaxKernel>();
+    k->configure(compile_context, src, dst, info);
+
+    _tmp_info = k->tmp_tensor_info();
+
+    _kernel = std::move(k);
+
+    _aux_mem[InternalTensorIdx::TMP] =
+        MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp_info.total_size());
+}
+
+Status ClSoftmax::validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info)
+{
+    return kernels::ClSoftmaxKernel::validate(src, dst, info);
+}
+
+void ClSoftmax::run(ITensorPack &tensors)
+{
+    CLAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp_info, tensors);
+
+    tensors.add_tensor(TensorType::ACL_INT_0, tmp.get());
+
+    CLScheduler::get().enqueue_op(*_kernel, tensors, false);
+}
+
+experimental::MemoryRequirements ClSoftmax::workspace() const
+{
+    return _aux_mem;
+}
+
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClSoftmax.h b/src/gpu/cl/operators/ClSoftmax.h
new file mode 100644
index 0000000000..232fcfebd1
--- /dev/null
+++ b/src/gpu/cl/operators/ClSoftmax.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_OPERATORS_CLSOFTMAX_H
+#define ACL_SRC_GPU_CL_OPERATORS_CLSOFTMAX_H
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+class CLCompileContext;
+class ITensorInfo;
+class ITensorPack;
+struct SoftmaxKernelInfo;
+
+namespace opencl
+{
+namespace kernels
+{
+class ClSoftmaxKernel;
+} // namespace kernels
+class ClSoftmax : public IClOperator
+{
+public:
+    /** Constructor */
+    ClSoftmax();
+    /** Configure the operator
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax
+     * @param[out] dst             Destination tensor info. Data types supported: same as @p src
+     * @param[in]  info            Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
+     */
+    void configure(const CLCompileContext  &compile_context,
+                   const ITensorInfo       &src,
+                   ITensorInfo             &dst,
+                   const SoftmaxKernelInfo &info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClSoftmax::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info);
+
+    void run(ITensorPack &tensors) override;
+
+    experimental::MemoryRequirements workspace() const override;
+
+private:
+    enum InternalTensorIdx
+    {
+        TMP = 0,
+        COUNT,
+    };
+
+    TensorInfo                       _tmp_info{};
+    experimental::MemoryRequirements _aux_mem;
+};
+
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_OPERATORS_CLSOFTMAX_H
diff --git a/src/gpu/cl/operators/ClSub.cpp b/src/gpu/cl/operators/ClSub.cpp
new file mode 100644
index 0000000000..5c6d0c3184
--- /dev/null
+++ b/src/gpu/cl/operators/ClSub.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClSub.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClSub::configure(const ClCompileContext    &compile_context,
+                      ITensorInfo               *src1,
+                      ITensorInfo               *src2,
+                      ITensorInfo               *dst,
+                      ConvertPolicy              policy,
+                      const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, policy, act_info);
+    auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>();
+    k->configure(compile_context, ArithmeticOperation::SUB, src1, src2, dst, policy, act_info);
+    _kernel = std::move(k);
+}
+
+Status ClSub::validate(const ITensorInfo         *src1,
+                       const ITensorInfo         *src2,
+                       const ITensorInfo         *dst,
+                       ConvertPolicy              policy,
+                       const ActivationLayerInfo &act_info)
+{
+    return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::SUB, src1, src2, dst, policy, act_info);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClSub.h b/src/gpu/cl/operators/ClSub.h
new file mode 100644
index 0000000000..6a97275b86
--- /dev/null
+++ b/src/gpu/cl/operators/ClSub.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_SUB_H
+#define ARM_COMPUTE_CL_SUB_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run arithmetic subtraction
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+ * @note The function performs an arithmetic subtraction between two tensors.
+ */
+class ClSub : public IClOperator
+{
+public:
+    /** Configure function for a given list of arguments.
+     *
+     * Valid configurations (src1,src2) -> dst :
+     *
+     *   - (U8,U8)           -> U8
+     *   - (U8,U8)           -> S16
+     *   - (S16,U8)          -> S16
+     *   - (U8,S16)          -> S16
+     *   - (S16,S16)         -> S16
+     *   - (S32,S32)         -> S32
+     *   - (F16,F16)         -> F16
+     *   - (F32,F32)         -> F32
+     *   - (QASYMM8,QASYMM8) -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16) -> QSYMM16
+     *
+     * @param[in]      compile_context The compile context to be used.
+     * @param[in, out] src1            First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     *                                 The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[in, out] src2            Second source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     *                                 The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+     * @param[out]     dst             Destination tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+     * @param[in]      policy          Policy to use to handle overflow.
+     * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const ClCompileContext    &compile_context,
+                   ITensorInfo               *src1,
+                   ITensorInfo               *src2,
+                   ITensorInfo               *dst,
+                   ConvertPolicy              policy,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClSub::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo         *src1,
+                           const ITensorInfo         *src2,
+                           const ITensorInfo         *dst,
+                           ConvertPolicy              policy,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_SUB_H */
diff --git a/src/gpu/cl/operators/ClTranspose.cpp b/src/gpu/cl/operators/ClTranspose.cpp
new file mode 100644
index 0000000000..28da0d640a
--- /dev/null
+++ b/src/gpu/cl/operators/ClTranspose.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClTranspose.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClTransposeKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClTranspose::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+    ARM_COMPUTE_LOG_PARAMS(src, dst);
+    auto k = std::make_unique<kernels::ClTransposeKernel>();
+    k->configure(compile_context, src, dst);
+    _kernel = std::move(k);
+}
+
+Status ClTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    return kernels::ClTransposeKernel::validate(src, dst);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClTranspose.h b/src/gpu/cl/operators/ClTranspose.h
new file mode 100644
index 0000000000..3642fc23f9
--- /dev/null
+++ b/src/gpu/cl/operators/ClTranspose.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_TRANSPOSE_H
+#define ARM_COMPUTE_CL_TRANSPOSE_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClTransposeKernel */
+class ClTranspose : public IClOperator
+{
+public:
+    /** Initialise the kernel's inputs and outputs
+     *
+     * @param[in] compile_context The compile context to be used.
+     * @param[in] src             The src tensor info. Data types supported: All.
+     * @param[in] dst             The dst tensor info. Data types supported: Same as @p src
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClTranspose::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_TRANSPOSE_H */
diff --git a/src/gpu/cl/operators/ClTransposedConvolution.cpp b/src/gpu/cl/operators/ClTransposedConvolution.cpp
new file mode 100644
index 0000000000..cec438faeb
--- /dev/null
+++ b/src/gpu/cl/operators/ClTransposedConvolution.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClTransposedConvolution.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClTransposedConvolutionKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClTransposedConvolution::configure(const CLCompileContext &compile_context,
+                                        const ITensorInfo      *input,
+                                        const ITensorInfo      *weights,
+                                        const ITensorInfo      *biases,
+                                        ITensorInfo            *output,
+                                        const PadStrideInfo    &deconv_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, deconv_info);
+    auto kernel_object = std::make_unique<kernels::ClTransposedConvolutionKernel>();
+    kernel_object->set_target(CLScheduler::get().target());
+    kernel_object->configure(compile_context, input, weights, biases, output, deconv_info);
+    _transposed_conv_kernel = std::move(kernel_object);
+}
+
+Status ClTransposedConvolution::validate(const ITensorInfo   *input,
+                                         const ITensorInfo   *weights,
+                                         const ITensorInfo   *biases,
+                                         const ITensorInfo   *output,
+                                         const PadStrideInfo &deconv_info)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        kernels::ClTransposedConvolutionKernel::validate(input, weights, biases, output, deconv_info));
+    return Status{};
+}
+
+void ClTransposedConvolution::run(ITensorPack &tensors)
+{
+    CLScheduler::get().enqueue_op(*_transposed_conv_kernel.get(), tensors, false);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClTransposedConvolution.h b/src/gpu/cl/operators/ClTransposedConvolution.h
new file mode 100644
index 0000000000..660c4f85c1
--- /dev/null
+++ b/src/gpu/cl/operators/ClTransposedConvolution.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_H
+#define ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/IClOperator.h"
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to simulate a directly convolution layer. This function calls the following OpenCL kernels:
+ *
+ * -# @ref opencl::ClTransposedConvolution
+ */
+class ClTransposedConvolution : public IClOperator
+{
+public:
+    /** Default constructor */
+    ClTransposedConvolution() = default;
+    /** Default Destructor */
+    ~ClTransposedConvolution() = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    ClTransposedConvolution(const ClTransposedConvolution &) = delete;
+    /** Default move constructor */
+    ClTransposedConvolution(ClTransposedConvolution &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    ClTransposedConvolution &operator=(const ClTransposedConvolution &) = delete;
+    /** Default move assignment operator */
+    ClTransposedConvolution &operator=(ClTransposedConvolution &&) = default;
+
+    /** Set the input, weights, biases and output tensors.
+     *
+     * @note: Only NHWC data layout is supported
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor info with dimensions [IFM, width, height, batch]
+     *                             Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED.
+     * @param[in]  weights         Weight tensor info with dimensions [IFM, width, height, OFM].
+     *                             Data type supported: Same as @p input
+     * @param[in]  biases          (Optional) Biases tensor info. Biases are 1D tensor with dimension [OFM].
+     *                             Data type supported: Should match @p input data type if floating point, otherwise S32.
+     * @param[out] output          Output tensor info with dimensions [OFM, width, height, batch]
+     *                             The 1st dimension must be equal to the 4th dimension of the @p weights tensor.
+     *                             Data types supported: Same as @p input.
+     * @param[in]  deconv_info     Contains padding and stride information described in @ref PadStrideInfo.
+     *
+     */
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo      *input,
+                   const ITensorInfo      *weights,
+                   const ITensorInfo      *biases,
+                   ITensorInfo            *output,
+                   const PadStrideInfo    &deconv_info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClTransposedConvolution::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo   *input,
+                           const ITensorInfo   *weights,
+                           const ITensorInfo   *biases,
+                           const ITensorInfo   *output,
+                           const PadStrideInfo &deconv_info);
+
+    // Inherited method overridden
+    void run(ITensorPack &tensors) override;
+
+private:
+    std::unique_ptr<IClKernel> _transposed_conv_kernel{nullptr};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_H */
diff --git a/src/gpu/cl/operators/ClWinogradConv2d.cpp b/src/gpu/cl/operators/ClWinogradConv2d.cpp
new file mode 100644
index 0000000000..8ec96b247e
--- /dev/null
+++ b/src/gpu/cl/operators/ClWinogradConv2d.cpp
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) 2018-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClWinogradConv2d.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h"
+#include "src/gpu/cl/kernels/ClWinogradInputTransformKernel.h"
+#include "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h"
+#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
+#include "support/Cast.h"
+
+using namespace arm_compute::experimental;
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace
+{
+Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, DataLayout data_layout)
+{
+    Size2D output_tile = Size2D{};
+
+    const unsigned int kernel_max_dim = std::max(kernel_dims.width, kernel_dims.height);
+
+    // Check if the input spatial dimensions are smaller than 4
+    const bool is_input_lt4_nchw =
+        (input_dims.width <= 4 && input_dims.height <= 4) && (data_layout == DataLayout::NCHW);
+
+    if (kernel_max_dim == 3U)
+    {
+        if (kernel_dims == Size2D(3U, 3U))
+        {
+            output_tile = is_input_lt4_nchw ? Size2D(2U, 2U) : Size2D(4U, 4U);
+        }
+        else if (kernel_dims == Size2D(3U, 1U))
+        {
+            output_tile = is_input_lt4_nchw ? Size2D(2U, 1U) : Size2D(4U, 1U);
+        }
+        else
+        {
+            output_tile = is_input_lt4_nchw ? Size2D(1U, 2U) : Size2D(1U, 4U);
+        }
+    }
+    else if (kernel_max_dim == 5U)
+    {
+        output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U, kernel_dims.height == 1 ? 1U : 4U);
+    }
+    else if (kernel_max_dim == 7U)
+    {
+        output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U, kernel_dims.height == 1 ? 1U : 2U);
+    }
+
+    return output_tile;
+}
+
+bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size)
+{
+    // Check if we want to configure a Winograd configuration which requires fast math
+    using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
+
+    std::vector<WinogradConfiguration> fast_math_winograd = {
+        WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)),
+        WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7))};
+
+    auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
+                            std::pair<int, int>(kernel_size.width, kernel_size.height));
+
+    return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end();
+}
+
+Status validate_arguments(const ITensorInfo         *src,
+                          const ITensorInfo         *weights,
+                          const ITensorInfo         *biases,
+                          const ITensorInfo         *dst,
+                          const PadStrideInfo       &conv_info,
+                          const ActivationLayerInfo &act_info,
+                          bool                       enable_fast_math)
+{
+    // Get indeces for the width and height
+    const size_t idx_width  = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
+
+    // Input shape, kernel size and output tile
+    const Size2D input_dims  = Size2D(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height]);
+    const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
+    const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, src->data_layout());
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        ((conv_info.pad_left() > (kernel_size.x() / 2u)) || (conv_info.pad_right() > (kernel_size.x() / 2u))),
+        "Winograd only supports padding up to half kernel size");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        ((conv_info.pad_top() > (kernel_size.y() / 2u)) || (conv_info.pad_bottom() > (kernel_size.y() / 2u))),
+        "Winograd only supports padding up to half kernel size");
+
+    // Check if the Winograd configuration requires fast math
+    if (!enable_fast_math)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+            src, 1, DataType::F32); //disable winograd for fp16 if fast math is false.
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size),
+                                        "This Winograd configuration requires enable_fast_math=true");
+    }
+
+    const WinogradInfo winograd_info =
+        WinogradInfo(output_tile, kernel_size, input_dims, conv_info, src->data_layout());
+
+    // Validate input transform
+    const TensorShape input0_shape =
+        misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info);
+    const TensorInfo input0 = src->clone()->set_tensor_shape(input0_shape);
+    ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradInputTransformKernel::validate(src, &input0, winograd_info));
+
+    // Validate filter transform
+    const TensorShape input1_shape =
+        misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
+    const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape);
+    ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradFilterTransformKernel::validate(weights, &input1, winograd_info));
+
+    // Validate batched matrix multiply
+    TensorShape batched_mm_output_shape = input0.tensor_shape();
+    batched_mm_output_shape[0]          = input1.tensor_shape()[0];
+    const TensorInfo batched_mm_output  = input0.clone()->set_tensor_shape(batched_mm_output_shape);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        ClGemm::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f,
+                         GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false,
+                                  GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16))));
+
+    // Configure output transform
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        kernels::ClWinogradOutputTransformKernel::validate(&batched_mm_output, biases, dst, winograd_info, act_info));
+    return Status{};
+}
+
+} // namespace
+
+ClWinogradConv2d::ClWinogradConv2d()
+    : _batched_mm(),
+      _input_transform(std::make_unique<kernels::ClWinogradInputTransformKernel>()),
+      _filter_transform(std::make_unique<kernels::ClWinogradFilterTransformKernel>()),
+      _output_transform(std::make_unique<kernels::ClWinogradOutputTransformKernel>()),
+      _border_handler(),
+      _input0(),
+      _input1(),
+      _batched_mm_output(),
+      _is_prepared(false),
+      _aux_mem()
+{
+}
+
+ClWinogradConv2d::~ClWinogradConv2d() = default;
+
+void ClWinogradConv2d::configure(const ClCompileContext    &compile_context,
+                                 ITensorInfo               *src,
+                                 ITensorInfo               *weights,
+                                 ITensorInfo               *biases,
+                                 ITensorInfo               *dst,
+                                 const PadStrideInfo       &conv_info,
+                                 const ActivationLayerInfo &act_info,
+                                 bool                       enable_fast_math)
+{
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math));
+    ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info, enable_fast_math);
+
+    // Get indices for the width and height
+    const size_t idx_width  = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
+
+    // Input shape, kernel size and output tile
+    const Size2D input_dims  = Size2D(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height]);
+    const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
+    const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, src->data_layout());
+
+    // Check if the Winograd configuration requires fast math
+    if (!enable_fast_math)
+    {
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1,
+                                                      DataType::F32); //disable winograd for fp16 if fast math is false.
+        ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size),
+                                 "This Winograd configuration requires enable_fast_math=true");
+    }
+    const WinogradInfo winograd_info =
+        WinogradInfo(output_tile, kernel_size, input_dims, conv_info, src->data_layout());
+
+    _is_prepared = false;
+
+    // Configure input transform
+    _input_transform->configure(compile_context, src, &_input0, winograd_info);
+    _border_handler.configure(compile_context, src, _input_transform->border_size(), BorderMode::CONSTANT,
+                              PixelValue());
+
+    // Configure filter transform
+    _filter_transform->configure(compile_context, weights, &_input1, winograd_info);
+
+    // Configure batched matrix multiply
+    _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f,
+                          GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false,
+                                   GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16)));
+
+    // Configure output transform
+    _output_transform->set_target(CLScheduler::get().target());
+    _output_transform->configure(compile_context, &_batched_mm_output, biases, dst, winograd_info, act_info);
+
+    _aux_mem = _batched_mm.workspace();
+    const MemoryLifetime wino_wei_lifetm =
+        std::any_of(std::begin(_aux_mem), std::end(_aux_mem),
+                    [](const auto &r) { return (r.lifetime == MemoryLifetime::Persistent) && (r.size > 0); })
+            ? MemoryLifetime::Prepare
+            : MemoryLifetime::Persistent;
+    _aux_mem.push_back(MemoryInfo(offset_int_vec(2), MemoryLifetime::Temporary, _input0.total_size()));
+    _aux_mem.push_back(MemoryInfo(offset_int_vec(3), wino_wei_lifetm, _input1.total_size()));
+    _aux_mem.push_back(MemoryInfo(offset_int_vec(4), MemoryLifetime::Temporary, _batched_mm_output.total_size()));
+}
+
+Status ClWinogradConv2d::validate(const ITensorInfo         *src,
+                                  const ITensorInfo         *weights,
+                                  const ITensorInfo         *biases,
+                                  const ITensorInfo         *dst,
+                                  const PadStrideInfo       &conv_info,
+                                  const ActivationLayerInfo &act_info,
+                                  bool                       enable_fast_math)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math));
+    return Status{};
+}
+
+void ClWinogradConv2d::run(ITensorPack &tensors)
+{
+    const bool is_gemm_reshaped = _aux_mem[3].lifetime == MemoryLifetime::Prepare;
+
+    auto src    = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    auto biases = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+    auto dst    = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    CLAuxTensorHandler input0(offset_int_vec(2), _input0, tensors, true);
+    CLAuxTensorHandler input1(offset_int_vec(3), _input1, tensors, true, is_gemm_reshaped);
+    CLAuxTensorHandler batched_mm_output(offset_int_vec(4), _batched_mm_output, tensors, true);
+
+    prepare(tensors);
+
+    // Run input transform
+    ITensorPack pack_it{
+        {TensorType::ACL_SRC, src},
+        {TensorType::ACL_DST, input0.get()},
+    };
+    CLScheduler::get().enqueue_op(_border_handler, pack_it, false);
+    CLScheduler::get().enqueue_op(*_input_transform, pack_it, false);
+
+    // Run batched matrix multiplication
+    ITensorPack pack_mm = tensors;
+    pack_mm.add_const_tensor(TensorType::ACL_SRC_0, input0.get());
+    pack_mm.add_tensor(TensorType::ACL_DST, batched_mm_output.get());
+    is_gemm_reshaped ? pack_mm.remove_tensor(TensorType::ACL_SRC_1)
+                     : pack_mm.add_const_tensor(TensorType::ACL_SRC_1, input1.get());
+    _batched_mm.run(pack_mm);
+
+    // Run output transform
+    ITensorPack pack_ot{
+        {TensorType::ACL_SRC_0, batched_mm_output.get()},
+        {TensorType::ACL_SRC_1, biases},
+        {TensorType::ACL_DST, dst},
+    };
+    CLScheduler::get().enqueue_op(*_output_transform, pack_ot);
+}
+
+void ClWinogradConv2d::prepare(ITensorPack &tensors)
+{
+    if (!_is_prepared)
+    {
+        auto weights =
+            utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+        ICLTensor *in1_aux = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(3)));
+
+        CLAuxTensorHandler input1(_input1, *in1_aux);
+        ITensorPack        pack_ft{
+            {TensorType::ACL_SRC, weights},
+            {TensorType::ACL_DST, input1.get()},
+        };
+        // Run filter transform and mark original weights as unused
+        CLScheduler::get().enqueue_op(*_filter_transform, pack_ft, false);
+        weights->mark_as_unused();
+
+        // Prepare GEMM and release reshaped weights if marked unused by ClGemm
+        ITensorPack mm_prepare_pack = tensors;
+        mm_prepare_pack.add_tensor(ACL_SRC_1, input1.get());
+        _batched_mm.prepare(mm_prepare_pack);
+
+        CLScheduler::get().queue().finish();
+        _is_prepared = true;
+    }
+}
+
+experimental::MemoryRequirements ClWinogradConv2d::workspace() const
+{
+    return _aux_mem;
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClWinogradConv2d.h b/src/gpu/cl/operators/ClWinogradConv2d.h
new file mode 100644
index 0000000000..54ec1a1737
--- /dev/null
+++ b/src/gpu/cl/operators/ClWinogradConv2d.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_WINOGRADCONV2D_H
+#define ARM_COMPUTE_CL_WINOGRADCONV2D_H
+
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+#include "src/gpu/cl/operators/ClGemm.h"
+
+namespace arm_compute
+{
+class CLCompileContext;
+class ITensorInfo;
+namespace opencl
+{
+namespace kernels
+{
+class ClWinogradInputTransformKernel;
+class ClWinogradFilterTransformKernel;
+class ClWinogradOutputTransformKernel;
+} // namespace kernels
+/** Basic function to execute Winograd-based convolution on OpenCL. This function calls the following OpenCL functions/kernels:
+ *
+ *  -# @ref kernels::ClWinogradInputTransformKernel
+ *  -# @ref kernels::ClWinogradFilterTransformKernel (only once)
+ *  -# @ref ClGemm
+ *  -# @ref kernels::ClWinogradOutputTransformKernel
+ *
+ */
+class ClWinogradConv2d : public IClOperator
+{
+public:
+    /** Default constructor */
+    ClWinogradConv2d();
+    /** Default destructor */
+    ~ClWinogradConv2d();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    ClWinogradConv2d(const ClWinogradConv2d &) = delete;
+    /** Default move constructor */
+    ClWinogradConv2d(ClWinogradConv2d &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    ClWinogradConv2d &operator=(const ClWinogradConv2d &) = delete;
+    /** Default move assignment operator */
+    ClWinogradConv2d &operator=(ClWinogradConv2d &&) = default;
+    /** Set the input and output tensors.
+     *
+     * Valid data layouts:
+     * - NHWC
+     * - NCHW
+     *
+     * Valid data type configurations:
+     * |src0           |src1           |src2   |dst            |
+     * |:--------------|:--------------|:------|:--------------|
+     * |F16            |F16            |F16    |F16            |
+     * |F32            |F32            |F32    |F32            |
+     *
+     * @note: This function only works with 3x3,3x1,1x3,5x5,5x1,1x5,7x1 and 1x7 kernels along with unit strides for both NCHW and NHWC data layout
+     * @note  Some Winograd configurations (i.e. F(4x4, 5x5)) are supported only with enable_fast_math = true
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  src              Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+     *                              while every optional dimension from 4 and above represent a batch of inputs.
+     *                              Data types supported: F16/F32.
+     * @param[in]  weights          Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p src.
+     * @param[in]  biases           Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].Data type supported: Same as @p src
+     * @param[out] dst              Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+     *                              Data types supported: Same as @p src.
+     * @param[in]  conv_info        Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  act_info         (Optional) Activation layer information in case of a fused activation.
+     * @param[in]  enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation
+     *                              available which may introduce a drop of accuracy as well. Default is false
+     */
+    void configure(const ClCompileContext    &compile_context,
+                   ITensorInfo               *src,
+                   ITensorInfo               *weights,
+                   ITensorInfo               *biases,
+                   ITensorInfo               *dst,
+                   const PadStrideInfo       &conv_info,
+                   const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                   bool                       enable_fast_math = false);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClWinogradConv2d::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo         *src,
+                           const ITensorInfo         *weights,
+                           const ITensorInfo         *biases,
+                           const ITensorInfo         *dst,
+                           const PadStrideInfo       &conv_info,
+                           const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                           bool                       enable_fast_math = false);
+
+    // Inherited method overridden
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &tensors) override;
+    experimental::MemoryRequirements workspace() const override;
+
+private:
+    ClGemm                                                    _batched_mm;
+    std::unique_ptr<kernels::ClWinogradInputTransformKernel>  _input_transform;
+    std::unique_ptr<kernels::ClWinogradFilterTransformKernel> _filter_transform;
+    std::unique_ptr<kernels::ClWinogradOutputTransformKernel> _output_transform;
+    CLFillBorderKernel                                        _border_handler;
+    TensorInfo                                                _input0;
+    TensorInfo                                                _input1;
+    TensorInfo                                                _batched_mm_output;
+    bool                                                      _is_prepared;
+    experimental::MemoryRequirements                          _aux_mem{};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_WINOGRADCONV2D_H */
diff --git a/src/gpu/cl/utils/ClAuxTensorHandler.h b/src/gpu/cl/utils/ClAuxTensorHandler.h
new file mode 100644
index 0000000000..12226699f8
--- /dev/null
+++ b/src/gpu/cl/utils/ClAuxTensorHandler.h
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 2021, 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_UTILS_CLAUXTENSORHANDLER_H
+#define ACL_SRC_GPU_CL_UTILS_CLAUXTENSORHANDLER_H
+
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include "src/common/utils/Log.h"
+#include "support/Cast.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Tensor handler to wrap and handle tensor allocations on workspace buffers
+ *
+ * @note About memory handling using bypass_* flags
+ *       See @ref arm_compute::cpu::CpuAuxTensorHandler
+ */
+class CLAuxTensorHandler
+{
+public:
+    /** Create a temporary tensor handle, by either important an existing tensor from a tensor pack, or allocating a
+     *  new one.
+     *
+     * @param[in]     slot_id       Slot id of the tensor to be retrieved in the tensor pack
+     *                              If no such tensor exists in the tensor pack, a new tensor will be allocated.
+     * @param[in]     info          Tensor info containing requested size of the new tensor.
+     *                              If requested size is larger than the tensor retrieved from the tensor pack,
+     *                              a new tensor will be allocated.
+     * @param[in,out] pack          Tensor pack to retrieve the old tensor. When @p pack_inject is true, the new
+     *                              tensor will also be added here.
+     * @param[in]     pack_inject   In case of a newly allocated tensor, whether to add this tensor back to the
+     *                              @p pack
+     * @param[in]     bypass_alloc  Bypass allocation in case of a new tensor
+     *                              This is to prevent unnecessary memory operations when the handler object is not
+     *                              used
+     * @param[in]     bypass_import Bypass importation in case of a retrieved tensor
+     *                                  This is to prevent unnecessary memory operations when the handler object is not
+     *                                  used
+     */
+    CLAuxTensorHandler(int          slot_id,
+                       TensorInfo  &info,
+                       ITensorPack &pack,
+                       bool         pack_inject   = false,
+                       bool         bypass_alloc  = false,
+                       bool         bypass_import = false)
+        : _tensor()
+    {
+        if (info.total_size() == 0)
+        {
+            return;
+        }
+        _tensor.allocator()->soft_init(info);
+
+        ICLTensor *packed_tensor = utils::cast::polymorphic_downcast<ICLTensor *>(pack.get_tensor(slot_id));
+        if ((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size()))
+        {
+            if (!bypass_alloc)
+            {
+                _tensor.allocator()->allocate();
+                ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Allocating auxiliary tensor");
+            }
+
+            if (pack_inject)
+            {
+                pack.add_tensor(slot_id, &_tensor);
+                _injected_tensor_pack = &pack;
+                _injected_slot_id     = slot_id;
+            }
+        }
+        else
+        {
+            if (!bypass_import)
+            {
+                _tensor.allocator()->import_memory(packed_tensor->cl_buffer());
+            }
+        }
+    }
+
+    /** Create a temporary handle to the original tensor with a new @ref TensorInfo
+     * This is useful if we want to change a tensor's tensor info at run time without modifying the original tensor
+     *
+     * @param[in] info          New tensor info to "assign" to @p tensor
+     * @param[in] tensor        Tensor to be assigned a new @ref TensorInfo
+     * @param[in] bypass_import Bypass importing @p tensor's memory into the handler.
+     *                          This is to prevent unnecessary memory operations when the handler object is not used
+     */
+    CLAuxTensorHandler(TensorInfo &info, ICLTensor &tensor, bool bypass_import = false) : _tensor()
+    {
+        _tensor.allocator()->soft_init(info);
+        if (!bypass_import)
+        {
+            ARM_COMPUTE_ERROR_ON(tensor.info() == nullptr);
+            if (info.total_size() <= tensor.info()->total_size())
+            {
+                _tensor.allocator()->import_memory(tensor.cl_buffer());
+            }
+        }
+    }
+
+    CLAuxTensorHandler(const CLAuxTensorHandler &)          = delete;
+    CLAuxTensorHandler &operator=(const CLAuxTensorHandler) = delete;
+
+    ~CLAuxTensorHandler()
+    {
+        if (_injected_tensor_pack)
+        {
+            _injected_tensor_pack->remove_tensor(_injected_slot_id);
+        }
+    }
+
+    ICLTensor *get()
+    {
+        return &_tensor;
+    }
+
+    ICLTensor *operator()()
+    {
+        return &_tensor;
+    }
+
+private:
+    CLTensor     _tensor{};
+    ITensorPack *_injected_tensor_pack{nullptr};
+    int          _injected_slot_id{TensorType::ACL_UNKNOWN};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_UTILS_CLAUXTENSORHANDLER_H