Diffstat (limited to 'src/gpu/cl/operators')
-rw-r--r--  src/gpu/cl/operators/ClActivation.cpp | 88
-rw-r--r--  src/gpu/cl/operators/ClActivation.h | 61
-rw-r--r--  src/gpu/cl/operators/ClAdd.cpp | 56
-rw-r--r--  src/gpu/cl/operators/ClAdd.h | 89
-rw-r--r--  src/gpu/cl/operators/ClCast.cpp | 50
-rw-r--r--  src/gpu/cl/operators/ClCast.h | 73
-rw-r--r--  src/gpu/cl/operators/ClConcatenate.cpp | 255
-rw-r--r--  src/gpu/cl/operators/ClConcatenate.h | 82
-rw-r--r--  src/gpu/cl/operators/ClConv2d.cpp | 432
-rw-r--r--  src/gpu/cl/operators/ClConv2d.h | 167
-rw-r--r--  src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp | 54
-rw-r--r--  src/gpu/cl/operators/ClConvertFullyConnectedWeights.h | 64
-rw-r--r--  src/gpu/cl/operators/ClCopy.cpp | 47
-rw-r--r--  src/gpu/cl/operators/ClCopy.h | 62
-rw-r--r--  src/gpu/cl/operators/ClCrop.cpp | 60
-rw-r--r--  src/gpu/cl/operators/ClCrop.h | 77
-rw-r--r--  src/gpu/cl/operators/ClDequantize.cpp | 56
-rw-r--r--  src/gpu/cl/operators/ClDequantize.h | 58
-rw-r--r--  src/gpu/cl/operators/ClDirectConv2d.cpp | 147
-rw-r--r--  src/gpu/cl/operators/ClDirectConv2d.h | 93
-rw-r--r--  src/gpu/cl/operators/ClDirectConv3d.cpp | 65
-rw-r--r--  src/gpu/cl/operators/ClDirectConv3d.h | 97
-rw-r--r--  src/gpu/cl/operators/ClElementwiseOperations.cpp | 133
-rw-r--r--  src/gpu/cl/operators/ClElementwiseOperations.h | 202
-rw-r--r--  src/gpu/cl/operators/ClElementwiseUnary.cpp | 124
-rw-r--r--  src/gpu/cl/operators/ClElementwiseUnary.h | 175
-rw-r--r--  src/gpu/cl/operators/ClFill.cpp | 50
-rw-r--r--  src/gpu/cl/operators/ClFill.h | 62
-rw-r--r--  src/gpu/cl/operators/ClFlatten.cpp | 47
-rw-r--r--  src/gpu/cl/operators/ClFlatten.h | 66
-rw-r--r--  src/gpu/cl/operators/ClFloor.cpp | 47
-rw-r--r--  src/gpu/cl/operators/ClFloor.h | 55
-rw-r--r--  src/gpu/cl/operators/ClFullyConnected.cpp | 698
-rw-r--r--  src/gpu/cl/operators/ClFullyConnected.h | 179
-rw-r--r--  src/gpu/cl/operators/ClGemm.cpp | 923
-rw-r--r--  src/gpu/cl/operators/ClGemm.h | 207
-rw-r--r--  src/gpu/cl/operators/ClGemmConv2d.cpp | 668
-rw-r--r--  src/gpu/cl/operators/ClGemmConv2d.h | 206
-rw-r--r--  src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp | 950
-rw-r--r--  src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h | 167
-rw-r--r--  src/gpu/cl/operators/ClGemmLowpOutputStage.cpp | 108
-rw-r--r--  src/gpu/cl/operators/ClGemmLowpOutputStage.h | 95
-rw-r--r--  src/gpu/cl/operators/ClIndirectConv2d.cpp | 159
-rw-r--r--  src/gpu/cl/operators/ClIndirectConv2d.h | 120
-rw-r--r--  src/gpu/cl/operators/ClLogicalNot.cpp | 47
-rw-r--r--  src/gpu/cl/operators/ClLogicalNot.h | 55
-rw-r--r--  src/gpu/cl/operators/ClMatMul.cpp | 157
-rw-r--r--  src/gpu/cl/operators/ClMatMul.h | 101
-rw-r--r--  src/gpu/cl/operators/ClMul.cpp | 81
-rw-r--r--  src/gpu/cl/operators/ClMul.h | 124
-rw-r--r--  src/gpu/cl/operators/ClPRelu.cpp | 63
-rw-r--r--  src/gpu/cl/operators/ClPRelu.h | 65
-rw-r--r--  src/gpu/cl/operators/ClPermute.cpp | 50
-rw-r--r--  src/gpu/cl/operators/ClPermute.h | 61
-rw-r--r--  src/gpu/cl/operators/ClPool2d.cpp | 63
-rw-r--r--  src/gpu/cl/operators/ClPool2d.h | 71
-rw-r--r--  src/gpu/cl/operators/ClPool3d.cpp | 59
-rw-r--r--  src/gpu/cl/operators/ClPool3d.h | 68
-rw-r--r--  src/gpu/cl/operators/ClQuantize.cpp | 56
-rw-r--r--  src/gpu/cl/operators/ClQuantize.h | 60
-rw-r--r--  src/gpu/cl/operators/ClReshape.cpp | 47
-rw-r--r--  src/gpu/cl/operators/ClReshape.h | 55
-rw-r--r--  src/gpu/cl/operators/ClScale.cpp | 67
-rw-r--r--  src/gpu/cl/operators/ClScale.h | 68
-rw-r--r--  src/gpu/cl/operators/ClScatter.cpp | 130
-rw-r--r--  src/gpu/cl/operators/ClScatter.h | 100
-rw-r--r--  src/gpu/cl/operators/ClSoftmax.cpp | 84
-rw-r--r--  src/gpu/cl/operators/ClSoftmax.h | 86
-rw-r--r--  src/gpu/cl/operators/ClSub.cpp | 56
-rw-r--r--  src/gpu/cl/operators/ClSub.h | 89
-rw-r--r--  src/gpu/cl/operators/ClTranspose.cpp | 47
-rw-r--r--  src/gpu/cl/operators/ClTranspose.h | 55
-rw-r--r--  src/gpu/cl/operators/ClTransposedConvolution.cpp | 67
-rw-r--r--  src/gpu/cl/operators/ClTransposedConvolution.h | 97
-rw-r--r--  src/gpu/cl/operators/ClWinogradConv2d.cpp | 328
-rw-r--r--  src/gpu/cl/operators/ClWinogradConv2d.h | 138
76 files changed, 10369 insertions, 0 deletions
diff --git a/src/gpu/cl/operators/ClActivation.cpp b/src/gpu/cl/operators/ClActivation.cpp
new file mode 100644
index 0000000000..66877ebcec
--- /dev/null
+++ b/src/gpu/cl/operators/ClActivation.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClActivation.h"
+
+#include "src/common/IOperator.h"
+#include "src/common/utils/LegacySupport.h"
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/ClContext.h"
+#include "src/gpu/cl/kernels/ClActivationKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClActivation::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst, act_info);
+ auto k = std::make_unique<kernels::ClActivationKernel>();
+ k->configure(compile_context, src, dst, act_info);
+ _kernel = std::move(k);
+}
+
+Status ClActivation::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+{
+ return kernels::ClActivationKernel::validate(src, dst, act_info);
+}
+} // namespace opencl
+
+namespace gpu
+{
+namespace opencl
+{
+std::tuple<IOperator *, StatusCode> ClContext::create_activation(const AclTensorDescriptor &src,
+ const AclTensorDescriptor &dst,
+ const AclActivationDescriptor &act,
+ bool is_validate)
+{
+ TensorInfo src_info = detail::convert_to_legacy_tensor_info(src);
+ TensorInfo dst_info = detail::convert_to_legacy_tensor_info(dst);
+ auto info = detail::convert_to_activation_info(act);
+
+ if (is_validate && !bool(arm_compute::opencl::ClActivation::validate(&src_info.set_is_resizable(false),
+ &dst_info.set_is_resizable(false), info)))
+ {
+ return std::make_tuple(nullptr, StatusCode::UnsupportedConfig);
+ }
+
+ auto act_op = std::make_unique<arm_compute::opencl::ClActivation>();
+ act_op->configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info, info);
+
+ auto op = new arm_compute::IOperator(static_cast<IContext *>(this));
+ if (op == nullptr)
+ {
+ ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources");
+ return std::make_tuple(nullptr, StatusCode::OutOfMemory);
+ }
+ op->set_internal_operator(std::move(act_op));
+
+ return std::make_tuple(op, StatusCode::Success);
+}
+} // namespace opencl
+} // namespace gpu
+} // namespace arm_compute
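
For context: ClActivation is a stateless operator, so configure() and validate() act on ITensorInfo metadata only and the backing CLTensors are bound at execution time through an ITensorPack, which is what the create_activation() factory above builds on. A minimal usage sketch, assuming the ACL_SRC/ACL_DST pack slots and the standard CLScheduler/CLTensor setup used elsewhere in the library:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include "src/gpu/cl/operators/ClActivation.h"

using namespace arm_compute;

void relu_example()
{
    CLScheduler::get().default_init(); // create the default CL context and queue

    // Configuration works on metadata only.
    TensorInfo src_info(TensorShape(16U, 16U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(16U, 16U), 1, DataType::F32);
    const ActivationLayerInfo act_info(ActivationLayerInfo::ActivationFunction::RELU);

    if (!bool(opencl::ClActivation::validate(&src_info, &dst_info, act_info)))
    {
        return; // configuration not supported
    }

    opencl::ClActivation act;
    act.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info, act_info);

    // Backing memory is only needed at run time and is passed via an ITensorPack.
    CLTensor src, dst;
    src.allocator()->init(src_info);
    dst.allocator()->init(dst_info);
    src.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC, &src);
    pack.add_tensor(TensorType::ACL_DST, &dst);
    act.run(pack);
}
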
diff --git a/src/gpu/cl/operators/ClActivation.h b/src/gpu/cl/operators/ClActivation.h
new file mode 100644
index 0000000000..4f25bb5f24
--- /dev/null
+++ b/src/gpu/cl/operators/ClActivation.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_ACTIVATION_H
+#define ARM_COMPUTE_CL_ACTIVATION_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClActivationKernel */
+class ClActivation : public IClOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32.
+ * @param[out] dst Destination tensor info. Data type supported: same as @p src
+ * @param[in] activation_info Activation layer parameters.
+ */
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &activation_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClActivation::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_ACTIVATION_H */
diff --git a/src/gpu/cl/operators/ClAdd.cpp b/src/gpu/cl/operators/ClAdd.cpp
new file mode 100644
index 0000000000..b58d0df58d
--- /dev/null
+++ b/src/gpu/cl/operators/ClAdd.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClAdd.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClAdd::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, policy, act_info);
+ auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>();
+ k->configure(compile_context, ArithmeticOperation::ADD, src1, src2, dst, policy, act_info);
+ _kernel = std::move(k);
+}
+
+Status ClAdd::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
+{
+ return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::ADD, src1, src2, dst, policy, act_info);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClAdd.h b/src/gpu/cl/operators/ClAdd.h
new file mode 100644
index 0000000000..7aed902f5d
--- /dev/null
+++ b/src/gpu/cl/operators/ClAdd.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_ADD_H
+#define ARM_COMPUTE_CL_ADD_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run arithmetic addition
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+ * @note The function performs an arithmetic addition between two tensors.
+ */
+class ClAdd : public IClOperator
+{
+public:
+ /** Configure function for a given list of arguments.
+ *
+ * Valid configurations (src1,src2) -> dst :
+ *
+ * - (U8,U8) -> U8
+ * - (U8,U8) -> S16
+ * - (S16,U8) -> S16
+ * - (U8,S16) -> S16
+ * - (S16,S16) -> S16
+ * - (S32,S32) -> S32
+ * - (F16,F16) -> F16
+ * - (F32,F32) -> F32
+ * - (QASYMM8,QASYMM8) -> QASYMM8
+ * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+ * - (QSYMM16,QSYMM16) -> QSYMM16
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in, out] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+ * The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] src2 Second source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+ * The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[out] dst Destination tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+ * @param[in] policy Policy to use to handle overflow.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClAdd::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_ADD_H */
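
The (src1,src2) -> dst table above is what validate() checks before configuration. A sketch of driving the operator with a fused ReLU, assuming the ACL_SRC_0/ACL_SRC_1/ACL_DST pack slots used by the element-wise kernels and CLTensors that are already initialised and allocated:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include "src/gpu/cl/operators/ClAdd.h"

using namespace arm_compute;

// Saturating element-wise addition with a fused ReLU. The tensors must match one
// of the supported combinations listed above, e.g. (F32,F32) -> F32.
Status fused_add(CLTensor &src0, CLTensor &src1, CLTensor &dst)
{
    const ActivationLayerInfo relu(ActivationLayerInfo::ActivationFunction::RELU);
    ARM_COMPUTE_RETURN_ON_ERROR(
        opencl::ClAdd::validate(src0.info(), src1.info(), dst.info(), ConvertPolicy::SATURATE, relu));

    opencl::ClAdd add;
    add.configure(CLKernelLibrary::get().get_compile_context(), src0.info(), src1.info(), dst.info(),
                  ConvertPolicy::SATURATE, relu);

    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC_0, &src0); // first operand
    pack.add_tensor(TensorType::ACL_SRC_1, &src1); // second operand
    pack.add_tensor(TensorType::ACL_DST, &dst);
    add.run(pack);
    return Status{};
}
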
diff --git a/src/gpu/cl/operators/ClCast.cpp b/src/gpu/cl/operators/ClCast.cpp
new file mode 100644
index 0000000000..8f26ef003d
--- /dev/null
+++ b/src/gpu/cl/operators/ClCast.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClCast.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClCastKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClCast::configure(const ClCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ ConvertPolicy policy)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst, policy);
+ auto k = std::make_unique<kernels::ClCastKernel>();
+ k->configure(compile_context, src, dst, policy);
+ _kernel = std::move(k);
+}
+
+Status ClCast::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
+{
+ return kernels::ClCastKernel::validate(src, dst, policy);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClCast.h b/src/gpu/cl/operators/ClCast.h
new file mode 100644
index 0000000000..25d2293673
--- /dev/null
+++ b/src/gpu/cl/operators/ClCast.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_CAST_H
+#define ARM_COMPUTE_CL_CAST_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClCastKernel */
+class ClCast : public IClOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * @note Input data type must be different than output data type.
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------------------------------|
+ * |U8 | S8, U16, S16, U32, S32, F16, F32 |
+ * |U16 | U8, S8, S16, U32, S32, F16, F32 |
+ * |S16 | U8, S8, U16, U32, S32, F16, F32 |
+ * |U32 | U8, S8, U16, S16, S32, F16, F32 |
+ * |S32 | U8, S8, U16, S16, U32, F16, F32 |
+ * |F16 | U8, S8, U16, S16, U32, F32 |
+ * |F32 | U8, S8, U16, S16, U32, F16 |
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src The source tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
+     * @param[out] dst             The destination tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
+ * @param[in] policy Conversion policy.
+ */
+ void
+ configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClCast::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_CAST_H */
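
The conversion table above is what ClCastKernel::validate() enforces; note that the source and destination data types must differ. A brief sketch of checking and configuring an F32 -> U8 cast (illustrative only; tensor binding works as in the ClActivation sketch earlier):

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

#include "src/gpu/cl/operators/ClCast.h"

using namespace arm_compute;

void cast_example()
{
    // F32 -> U8 down-conversion with saturation; src and dst types must differ.
    TensorInfo src_info(TensorShape(8U, 8U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(8U, 8U), 1, DataType::U8);

    if (bool(opencl::ClCast::validate(&src_info, &dst_info, ConvertPolicy::SATURATE)))
    {
        opencl::ClCast cast;
        cast.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info,
                       ConvertPolicy::SATURATE);
        // Bind CLTensors through an ITensorPack (ACL_SRC / ACL_DST) and call cast.run(pack),
        // as in the ClActivation sketch earlier.
    }
}
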
diff --git a/src/gpu/cl/operators/ClConcatenate.cpp b/src/gpu/cl/operators/ClConcatenate.cpp
new file mode 100644
index 0000000000..31018b9768
--- /dev/null
+++ b/src/gpu/cl/operators/ClConcatenate.cpp
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClConcatenate.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/gpu/cl/kernels/ClBatchConcatenateKernel.h"
+#include "src/gpu/cl/kernels/ClDepthConcatenateKernel.h"
+#include "src/gpu/cl/kernels/ClHeightConcatenateKernel.h"
+#include "src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h"
+#include "src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h"
+#include "src/gpu/cl/kernels/ClWidthConcatenateKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClConcatenate::configure(const CLCompileContext &compile_context,
+ const std::vector<ITensorInfo *> &src_vector,
+ ITensorInfo *dst,
+ size_t axis)
+{
+ ARM_COMPUTE_ERROR_ON(dst == nullptr);
+ ARM_COMPUTE_LOG_PARAMS(src_vector, dst, axis);
+ _axis = axis;
+ _num_inputs = src_vector.size();
+
+ TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, _axis);
+ std::vector<const ITensorInfo *> const_src_vector(src_vector.size());
+ std::transform(src_vector.begin(), src_vector.end(), const_src_vector.begin(),
+ [](ITensorInfo *t)
+ {
+ ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+ return t;
+ });
+
+    // dst auto initialization if not yet initialized
+ auto_init_if_empty(*dst, dst_shape, 1, src_vector[0]->data_type());
+ ARM_COMPUTE_ERROR_THROW_ON(ClConcatenate::validate(const_src_vector, dst, axis));
+
+ unsigned int offset = 0;
+ switch (_axis)
+ {
+ case Window::DimX:
+ {
+ switch (_num_inputs)
+ {
+ case 2:
+ {
+ // Configure WidthConcatenate2Tensors kernel
+ auto kernel = std::make_unique<kernels::ClWidthConcatenate2TensorsKernel>();
+ kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), dst);
+ _concat_kernels.emplace_back(std::move(kernel));
+ break;
+ }
+ case 4:
+ {
+ // Configure WidthConcatenate4Tensors kernel
+ auto kernel = std::make_unique<kernels::ClWidthConcatenate4TensorsKernel>();
+ kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), src_vector.at(2),
+ src_vector.at(3), dst);
+ _concat_kernels.emplace_back(std::move(kernel));
+ break;
+ }
+ default:
+ {
+ // Configure generic case WidthConcatenate kernels
+ for (unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ auto kernel = std::make_unique<kernels::ClWidthConcatenateKernel>();
+ kernel->configure(compile_context, src_vector.at(i), offset, dst);
+ offset += src_vector.at(i)->dimension(_axis);
+ _concat_kernels.emplace_back(std::move(kernel));
+ }
+ break;
+ }
+ }
+ break;
+ }
+ case Window::DimY:
+ {
+ for (unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ auto kernel = std::make_unique<kernels::ClHeightConcatenateKernel>();
+ kernel->configure(compile_context, src_vector.at(i), offset, dst);
+ offset += src_vector.at(i)->dimension(_axis);
+ _concat_kernels.emplace_back(std::move(kernel));
+ }
+ break;
+ }
+ case Window::DimZ:
+ {
+ for (unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ auto kernel = std::make_unique<kernels::ClDepthConcatenateKernel>();
+ kernel->configure(compile_context, src_vector.at(i), offset, dst);
+ offset += src_vector.at(i)->dimension(_axis);
+ _concat_kernels.emplace_back(std::move(kernel));
+ }
+ break;
+ }
+ case 3:
+ {
+ for (unsigned int i = 0; i < _num_inputs; ++i)
+ {
+ auto kernel = std::make_unique<kernels::ClBatchConcatenateKernel>();
+ kernel->configure(compile_context, src_vector.at(i), offset, dst);
+ offset += src_vector.at(i)->dimension(_axis);
+ _concat_kernels.emplace_back(std::move(kernel));
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Axis not supported");
+ }
+}
+
+Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vector, const ITensorInfo *dst, size_t axis)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON(dst == nullptr);
+ const unsigned int num_inputs = src_vector.size();
+
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2);
+
+ unsigned int offset = 0;
+ switch (axis)
+ {
+ case Window::DimX:
+ {
+ switch (num_inputs)
+ {
+ case 2:
+ // Validate WidthConcatenate2Tensors kernels if there are 2 inputs
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1]);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::ClWidthConcatenate2TensorsKernel::validate(src_vector[0], src_vector[1], dst));
+ break;
+ case 4:
+ // Validate WidthConcatenate4Tensors kernels if there are 4 inputs
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1], src_vector[2], src_vector[3]);
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate4TensorsKernel::validate(
+ src_vector[0], src_vector[1], src_vector[2], src_vector[3], dst));
+ break;
+ default:
+ // Validate generic case of WidthConcatenate kernel
+ for (const auto &src : src_vector)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenateKernel::validate(src, offset, dst));
+ offset += src->dimension(axis);
+ }
+ break;
+ }
+ break;
+ }
+ case Window::DimY:
+ {
+ for (const auto &src : src_vector)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClHeightConcatenateKernel::validate(src, offset, dst));
+ offset += src->dimension(axis);
+ }
+ break;
+ }
+ case Window::DimZ:
+ {
+ for (const auto &src : src_vector)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDepthConcatenateKernel::validate(src, offset, dst));
+ offset += src->dimension(axis);
+ }
+ break;
+ }
+ case 3:
+ {
+ for (const auto &src : src_vector)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClBatchConcatenateKernel::validate(src, offset, dst));
+ offset += src->dimension(axis);
+ }
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Axis not supported");
+ }
+
+ if (dst->total_size() != 0)
+ {
+ TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, axis);
+ ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size());
+ }
+
+ return Status{};
+}
+
+void ClConcatenate::run(ITensorPack &tensors)
+{
+ if (tensors.empty())
+ {
+ ARM_COMPUTE_ERROR("No inputs provided");
+ }
+
+ if (static_cast<int>(tensors.size()) - 1 != static_cast<int>(_num_inputs))
+ {
+ ARM_COMPUTE_ERROR("Configured with different number of inputs");
+ }
+
+ if (_axis == Window::DimX && (_num_inputs == 2 || _num_inputs == 4))
+ {
+ ARM_COMPUTE_ERROR_ON(_concat_kernels.empty());
+ CLScheduler::get().enqueue_op(*_concat_kernels.at(0), tensors, true);
+ }
+ else
+ {
+ int i = 0;
+ for (auto &k : _concat_kernels)
+ {
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i));
+ pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(ACL_DST));
+ CLScheduler::get().enqueue_op(*k, pack, true);
+ ++i;
+ }
+ }
+}
+} // namespace opencl
+} // namespace arm_compute
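
As run() above shows, the destination is fetched from the ACL_DST slot and the i-th source from ACL_SRC_VEC + i. A sketch of a width-axis (axis 0) concatenation built on that contract, assuming already-initialised CLTensors:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include "src/gpu/cl/operators/ClConcatenate.h"

#include <vector>

using namespace arm_compute;

// Concatenate already-initialised tensors along the width axis (axis 0).
Status concat_width(const std::vector<CLTensor *> &srcs, CLTensor &dst)
{
    std::vector<ITensorInfo *>       src_infos;
    std::vector<const ITensorInfo *> const_infos;
    for (auto *t : srcs)
    {
        src_infos.push_back(t->info());
        const_infos.push_back(t->info());
    }
    ARM_COMPUTE_RETURN_ON_ERROR(opencl::ClConcatenate::validate(const_infos, dst.info(), 0));

    opencl::ClConcatenate concat;
    concat.configure(CLKernelLibrary::get().get_compile_context(), src_infos, dst.info(), 0);

    // run() looks up the i-th source at slot ACL_SRC_VEC + i and the output at ACL_DST.
    ITensorPack pack;
    for (int i = 0; i < static_cast<int>(srcs.size()); ++i)
    {
        pack.add_tensor(TensorType::ACL_SRC_VEC + i, srcs[static_cast<size_t>(i)]);
    }
    pack.add_tensor(TensorType::ACL_DST, &dst);
    concat.run(pack);
    return Status{};
}
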
diff --git a/src/gpu/cl/operators/ClConcatenate.h b/src/gpu/cl/operators/ClConcatenate.h
new file mode 100644
index 0000000000..d8ce9d2a5c
--- /dev/null
+++ b/src/gpu/cl/operators/ClConcatenate.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLCONCATENATE_H
+#define ARM_COMPUTE_CLCONCATENATE_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/IClOperator.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to execute concatenate tensors along a given axis. This function calls the following kernels:
+ *
+ * -# @ref kernels::ClWidthConcatenateKernel (if underlying concatenation axis is 0).
+ * -# @ref kernels::ClHeightConcatenateKernel (if underlying concatenation axis is 1).
+ * -# @ref kernels::ClDepthConcatenateKernel (if underlying concatenation axis is 2).
+ * -# @ref kernels::ClBatchConcatenateKernel (if underlying concatenation axis is 3).
+ */
+class ClConcatenate : public IClOperator
+{
+public:
+ ClConcatenate() = default;
+ /** Initialise the kernel's inputs vector and dst.
+ *
+     * @note Input and dst tensor dimension preconditions differ depending on the concatenation axis.
+ * @note Preconditions can be found respectively at @ref kernels::ClWidthConcatenateKernel,
+ * @ref kernels::ClHeightConcatenateKernel and @ref kernels::ClDepthConcatenateKernel.
+ *
+ *
+ * @param[in] compile_context The compile context to be used.
+     * @param[in,out] src_vector      The vector containing all the tensor infos to concatenate. Data types supported: All
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src_vector.
+ * @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
+ */
+ void configure(const ClCompileContext &compile_context,
+ const std::vector<ITensorInfo *> &src_vector,
+ ITensorInfo *dst,
+ size_t axis);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClConcatenate::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const std::vector<const ITensorInfo *> &src_vector, const ITensorInfo *dst, size_t axis);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+
+private:
+ std::vector<std::unique_ptr<IClKernel>> _concat_kernels{};
+ unsigned int _num_inputs{0};
+ unsigned int _axis{0};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLCONCATENATE_H */
diff --git a/src/gpu/cl/operators/ClConv2d.cpp b/src/gpu/cl/operators/ClConv2d.cpp
new file mode 100644
index 0000000000..2c3b0214fa
--- /dev/null
+++ b/src/gpu/cl/operators/ClConv2d.cpp
@@ -0,0 +1,432 @@
+/*
+ * Copyright (c) 2021-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClConv2d.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/operators/ClDirectConv2d.h"
+#include "src/gpu/cl/operators/ClGemmConv2d.h"
+#include "src/gpu/cl/operators/ClIndirectConv2d.h"
+#include "src/gpu/cl/operators/ClWinogradConv2d.h"
+
+#include <memory>
+
+namespace
+{
+/** Get the suitable kernel size for using direct convolution method with NHWC data layout.
+ *
+ * @note Direct convolution should be executed when the kernel's spatial dimensions are greater than or equal to the value returned by this function
+ *
+ * @param[in] gpu_target GPU target
+ *
+ * @return the suitable kernel size for using direct convolution method with NHWC data layout
+ */
+size_t get_direct_conv_kernel_threshold_nhwc(arm_compute::GPUTarget gpu_target)
+{
+ switch (gpu_target)
+ {
+ case arm_compute::GPUTarget::G76:
+ case arm_compute::GPUTarget::G77:
+ case arm_compute::GPUTarget::G78:
+ return 5;
+ case arm_compute::GPUTarget::G71:
+ case arm_compute::GPUTarget::G72:
+ case arm_compute::GPUTarget::MIDGARD:
+ case arm_compute::GPUTarget::BIFROST:
+ return 7;
+ default:
+ return 5;
+ }
+}
+} // namespace
+
+namespace arm_compute
+{
+namespace opencl
+{
+using namespace arm_compute::misc::shape_calculator;
+
+ClConv2d::ClConv2d() : _operator()
+{
+}
+
+ClConv2d::~ClConv2d() = default;
+
+void ClConv2d::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ const Conv2dInfo &conv2d_info,
+ const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ ClConv2d::validate(src, weights, ((biases != nullptr) ? biases : nullptr), dst, conv2d_info, weights_info));
+ ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv2d_info, weights_info);
+
+ switch (ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, CLScheduler::get().target()))
+ {
+ case ConvolutionMethod::WINOGRAD:
+ {
+ ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1);
+ auto f = std::make_unique<ClWinogradConv2d>();
+ f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info,
+ conv2d_info.enable_fast_math);
+ _operator = std::move(f);
+ break;
+ }
+ case ConvolutionMethod::DIRECT:
+ {
+ ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1);
+ auto f = std::make_unique<ClDirectConv2d>();
+ f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info);
+ _operator = std::move(f);
+ break;
+ }
+ case ConvolutionMethod::INDIRECT:
+ {
+ ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1);
+ auto f = std::make_unique<ClIndirectConv2d>();
+ f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info);
+ _operator = std::move(f);
+ break;
+ }
+ case ConvolutionMethod::GEMM:
+ {
+ auto f = std::make_unique<ClGemmConv2d>();
+ f->configure(compile_context, src, weights, biases, dst, conv2d_info, weights_info);
+ _operator = std::move(f);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported.");
+ break;
+ }
+ _aux_mem = _operator->workspace();
+}
+
+Status ClConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const Conv2dInfo &conv2d_info,
+ const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW),
+ "Grouping (num_groups != 1) with NHWC data layout is not supported");
+
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
+ switch (ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, gpu_target))
+ {
+ case ConvolutionMethod::WINOGRAD:
+ {
+ //Validate Winograd
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1,
+ "Grouping (num_groups != 1) with ClWinogradConv2d is not supported");
+ ARM_COMPUTE_RETURN_ON_ERROR(ClWinogradConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info,
+ conv2d_info.act_info, conv2d_info.enable_fast_math));
+ break;
+ }
+ case ConvolutionMethod::DIRECT:
+ {
+ // Validate direct convolution layer
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1,
+ "Grouping (num_groups != 1) with ClDirectConv2d is not supported");
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ ClDirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info));
+ break;
+ }
+ case ConvolutionMethod::INDIRECT:
+ {
+ // Validate indirect convolution layer
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1,
+ "Grouping (num_groups != 1) with ClIndirectConv2d is not supported");
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ ClIndirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info));
+ break;
+ }
+ case ConvolutionMethod::GEMM:
+ {
+ // Validate gemm-based convolution layer
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmConv2d::validate(src, weights, biases, dst, conv2d_info, weights_info));
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported.");
+ break;
+ }
+
+ return Status{};
+}
+
+ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const Conv2dInfo &conv2d_info,
+ const WeightsInfo &weights_info,
+ const GPUTarget gpu_target)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(dst);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(weights);
+ ARM_COMPUTE_UNUSED(weights_info);
+
+ const PadStrideInfo conv_info = conv2d_info.conv_info;
+ const ActivationLayerInfo act_info = conv2d_info.act_info;
+ const Size2D dilation = conv2d_info.dilation;
+ bool enable_fast_math = conv2d_info.enable_fast_math;
+
+ const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
+ const size_t idx_c = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
+
+ /* Input spatial dims, kernel size, IFM/OFM, conv info*/
+ using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo, DataLayout>;
+ using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>;
+
+ const std::vector<ConfigurationMethod> known_configs = {
+ // Alexnet
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U),
+ PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW),
+ ConvolutionMethod::DIRECT),
+ // VGG16 / VGG19
+ ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U),
+ PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW),
+ ConvolutionMethod::DIRECT),
+ // Mobilenet 224
+ ConfigurationMethod(ConvolutionConfiguration(
+ Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U),
+ PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW),
+ ConvolutionMethod::GEMM),
+ // Mobilenet 160
+ ConfigurationMethod(ConvolutionConfiguration(
+ Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U),
+ PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW),
+ ConvolutionMethod::GEMM),
+ // Mobilenet 224
+ ConfigurationMethod(ConvolutionConfiguration(
+ Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U),
+ PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC),
+ ConvolutionMethod::GEMM),
+ // Mobilenet 160
+ ConfigurationMethod(ConvolutionConfiguration(
+ Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U),
+ PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC),
+ ConvolutionMethod::GEMM),
+ };
+
+ const auto find_config = [&](ConfigurationMethod c)
+ {
+ const ConvolutionConfiguration config = c.first;
+ const PadStrideInfo info = std::get<3>(config);
+ const DataLayout data_layout = std::get<4>(config);
+
+ return std::get<0>(config) == Size2D(src->dimension(idx_w), src->dimension(idx_h)) &&
+ std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) &&
+ std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) &&
+ info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() &&
+ info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() &&
+ info.stride() == conv_info.stride() && (data_layout == src->data_layout());
+ };
+
+ std::vector<ConfigurationMethod>::const_iterator found;
+ if ((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
+ {
+ return (*found).second;
+ }
+
+ if (dilation != Size2D(1U, 1U))
+ {
+ return ConvolutionMethod::GEMM;
+ }
+ else
+ {
+ if (src->data_layout() == DataLayout::NCHW)
+ {
+ // SRGAN
+ if ((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) &&
+ (conv_info.pad_top() < 3) &&
+ (ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)))
+ {
+ return ConvolutionMethod::DIRECT;
+ }
+ if ((weights->dimension(idx_h) > 5) && (src->dimension(idx_c) > dst->dimension(idx_c)) &&
+ (CLFFTConvolutionLayer::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)))
+ {
+ return ConvolutionMethod::FFT;
+ }
+ if (src->dimension(idx_c) < 16)
+ {
+ return ConvolutionMethod::GEMM;
+ }
+ return bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math))
+ ? ConvolutionMethod::WINOGRAD
+ : ConvolutionMethod::GEMM;
+ }
+ else
+ {
+ const bool is_direct_valid =
+ bool(ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info));
+ const bool is_wino_valid =
+ bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math));
+ const size_t kernel_sz_direct_conv_thr = get_direct_conv_kernel_threshold_nhwc(gpu_target);
+
+ // SRGAN case
+ if ((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) &&
+ (conv_info.pad_top() < 3) && is_direct_valid)
+ {
+ return ConvolutionMethod::DIRECT;
+ }
+
+ // Floating-point case: GeMM/Direct/Winograd
+ if (is_data_type_float(src->data_type()))
+ {
+ // Get dst shape
+ TensorShape output_shape =
+ misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
+ const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) &&
+ (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr);
+ const bool is_ifm_ge_8 = src->dimension(idx_c) >= 8;
+ const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16;
+ const bool is_ofm_lte_8 = weights->dimension(3U) <= 8;
+ const bool is_ofm_lt_64 = weights->dimension(3U) < 64;
+ const bool workload_gte_8192 = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192;
+ const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U);
+ const bool is_m_one = output_shape[1] * output_shape[2] == 1;
+ const bool is_unit_stride =
+ (conv2d_info.conv_info.stride().first == 1) && (conv2d_info.conv_info.stride().second == 1);
+ const int32_t kernel_sz = weights->dimension(idx_w) * weights->dimension(idx_h);
+
+ // Run Winograd if valid and IFM >= 8
+ if (is_wino_valid && is_ifm_ge_8)
+ {
+ if (is_ofm_lte_8)
+ {
+ if (gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 ||
+ get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD)
+ {
+ return ConvolutionMethod::WINOGRAD;
+ }
+ }
+ else
+ {
+ return ConvolutionMethod::WINOGRAD;
+ }
+ }
+
+ // Direct convolution case
+ if (is_direct_valid)
+ {
+ if ((gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 ||
+ get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD))
+ {
+ if (is_large_kernel_sz && is_ifm_ge_16 && is_ifm_gt_ofm)
+ {
+ return ConvolutionMethod::DIRECT;
+ }
+ }
+ else if (gpu_target == arm_compute::GPUTarget::G76)
+ {
+ if ((is_large_kernel_sz && workload_gte_8192 && is_ifm_ge_16) || (is_ofm_lte_8 && is_ifm_ge_16))
+ {
+ return ConvolutionMethod::DIRECT;
+ }
+ }
+ else
+ {
+ ConvolutionMethod preferred_conv_method = ConvolutionMethod::DIRECT;
+
+ const bool is_indirect_valid =
+ bool(ClIndirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info));
+
+ // indirect conv2d should be called when:
+ // 1- When the kernel size is greater than 1x1 and less than or equal to 9x9 (81)
+ // 2- When the kernel size is odd
+ // 3- When the Gpu target is Arm Mali-G77
+ if (is_indirect_valid)
+ {
+ const bool is_kernel_sz_odd = kernel_sz % 2;
+ const bool is_g77 = gpu_target == GPUTarget::G77;
+ preferred_conv_method = (kernel_sz > 1) && (kernel_sz <= 81) && is_kernel_sz_odd && is_g77
+ ? ConvolutionMethod::INDIRECT
+ : ConvolutionMethod::DIRECT;
+ }
+
+ // Direct/indirect convolution used for the first layer of the network
+ if (workload_gte_8192 && !is_ifm_ge_16 && !is_unit_stride && is_ofm_lt_64)
+ {
+ // In general, the question we should ask for the first convolution layer of a model is:
+ // when the execution time of im2col + gemm < direct?. Since im2col does not depend on the OFM, it means that
+ // when OFM is big enough, the contribution of im2col is small and the GEMM approach is preferable.
+ // From internal experiments, the OFM threshold is 64 (is_ofm_lt_64)
+ return preferred_conv_method;
+ }
+
+ if ((is_large_kernel_sz || is_m_one) && workload_gte_8192 && is_ifm_ge_16)
+ {
+ return preferred_conv_method;
+ }
+
+ // Direct convolution used for the last layer of the network
+ if (is_ofm_lte_8)
+ {
+ return preferred_conv_method;
+ }
+ }
+ }
+
+ // Default case
+ return ConvolutionMethod::GEMM;
+ }
+
+ // Generic case for quantized. Only GeMM
+ return ConvolutionMethod::GEMM;
+ }
+ }
+}
+
+void ClConv2d::run(ITensorPack &tensors)
+{
+ prepare(tensors);
+ _operator->run(tensors);
+}
+
+void ClConv2d::prepare(ITensorPack &tensors)
+{
+ _operator->prepare(tensors);
+}
+
+experimental::MemoryRequirements ClConv2d::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace opencl
+} // namespace arm_compute
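
The heuristic in get_convolution_method() can be queried on its own, which helps explain why a given layer lands on the GEMM, Direct, Indirect or Winograd path. A sketch that exercises the known-configuration table above (the five-argument Conv2dInfo constructor and the NHWC shape ordering [C, W, H, N] are assumptions based on the library's public headers):

#include "arm_compute/core/GPUTarget.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/FunctionDescriptors.h"

#include "src/gpu/cl/operators/ClConv2d.h"

using namespace arm_compute;

ConvolutionMethod query_mobilenet_first_layer()
{
    // 224x224 NHWC input with 3 channels, 3x3 kernel, 32 OFM, stride 2: this matches
    // the "Mobilenet 224" entry of the known-configuration table above.
    TensorInfo src(TensorShape(3U, 224U, 224U), 1, DataType::F32);
    TensorInfo weights(TensorShape(3U, 3U, 3U, 32U), 1, DataType::F32);
    TensorInfo dst(TensorShape(32U, 112U, 112U), 1, DataType::F32);
    src.set_data_layout(DataLayout::NHWC);
    weights.set_data_layout(DataLayout::NHWC);
    dst.set_data_layout(DataLayout::NHWC);

    const Conv2dInfo conv2d_info(PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR),
                                 Size2D(1U, 1U), ActivationLayerInfo(), false /* enable_fast_math */,
                                 1 /* num_groups */);

    // Expected to hit the known-configuration table and return ConvolutionMethod::GEMM.
    return opencl::ClConv2d::get_convolution_method(&src, &weights, &dst, conv2d_info, WeightsInfo(),
                                                    GPUTarget::G77);
}
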
diff --git a/src/gpu/cl/operators/ClConv2d.h b/src/gpu/cl/operators/ClConv2d.h
new file mode 100644
index 0000000000..0cf3cbc1ce
--- /dev/null
+++ b/src/gpu/cl/operators/ClConv2d.h
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLCONV2D_H
+#define ARM_COMPUTE_CLCONV2D_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to compute the convolution layer. This function calls the following OpenCL kernels/functions:
+ *
+ * -# @ref opencl::ClGemmConv2d
+ * -# @ref opencl::ClWinogradConv2d
+ * -# @ref opencl::ClIndirectConv2d
+ * -# @ref opencl::ClDirectConv2d
+ * -# @ref CLFFTConvolutionLayer
+ *
+ * The function selects one of the algorithms mentioned above based on:
+ * - The size of the kernel
+ * - Number of src/dst feature maps
+ * - Amount of memory needed
+ *
+ * Generally GEMM-based convolution is executed when neither Winograd nor FFT nor Direct convolution can be performed.
+ *
+ * FP32 Algorithm| Filter Size | Input/Output feature maps |
+ * --------------|-------------------------------------------------------------|-------------------------------------------|
+ * Winograd | 3x3 1x3 3x1 5x1 1x5 5x5(fast maths) 7x1 1x7 | Input channels is greater than 3 |
+ * FFT | Squared kernels and greater than 9x9 | Input feature maps > Output feature maps |
+ * DirectConv | 9x9 | |
+ * GEMM | Any size | |
+ *
+ * Winograd 5x5 requires fast maths enabled.
+ *
+ * FP16 Algorithm| Filter Size | Input/Output feature maps |
+ * --------------|----------------------------|-------------------------------------------|
+ * Winograd | 3x3 1x3 3x1 5x1 1x5 5x5 | Input channels is greater than 3 |
+ * FFT | Not supported | |
+ * DirectConv | 9x9 | |
+ * GEMM | Any size | |
+ *
+ * Winograd FP16 requires fast maths enabled.
+ *
+ */
+class ClConv2d : public IClOperator
+{
+public:
+ /** Default constructor */
+ ClConv2d();
+ /** Default Destructor */
+ ~ClConv2d();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ ClConv2d(const ClConv2d &) = delete;
+ /** Default move constructor */
+ ClConv2d(ClConv2d &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ ClConv2d &operator=(const ClConv2d &) = delete;
+ /** Default move assignment operator */
+ ClConv2d &operator=(ClConv2d &&) = default;
+ /** Set the src and dst tensors.
+ *
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. 3 lower dimensions represent a single src [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of srcs.
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * Data type supported: Same as @p src, also could be QSYMM8_PER_CHANNEL if src is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Same as @p src, except for src of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+ * @param[out] dst Destination tensor info. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent batch of dsts.
+ * Data types supported: Same as @p src.
+ * @param[in] conv2d_info Contains convolution 2d info described in @ref Conv2dInfo.
+ * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p src.
+ */
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ const Conv2dInfo &conv2d_info,
+ const WeightsInfo &weights_info = WeightsInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref ClConv2d
+ *
+ * Similar to ClConv2d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const Conv2dInfo &conv2d_info,
+ const WeightsInfo &weights_info = WeightsInfo());
+ /** Static function to check if given info will return the convolution called by @ref ClConv2d
+ *
+ * @param[in] src Source tensor. 3 lower dimensions represent a single src [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of srcs.
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * Data type supported: Same as @p src, also could be QSYMM8_PER_CHANNEL if src is QASYMM8/QASYMM8_SIGNED.
+ * @param[in] dst Destination tensor. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent batch of dsts.
+ * Data types supported: Same as @p src.
+ * @param[in] conv2d_info Contains convolution 2d info described in @ref Conv2dInfo.
+ * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel.
+ * @param[in] gpu_target Specifies the @p GPUTarget.
+ *
+ * @return the Convolution Method Hint
+ */
+ static ConvolutionMethod get_convolution_method(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *dst,
+ const Conv2dInfo &conv2d_info,
+ const WeightsInfo &weights_info,
+ const GPUTarget gpu_target);
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ std::unique_ptr<IClOperator> _operator;
+ experimental::MemoryRequirements _aux_mem{};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CLCONV2D_H */
diff --git a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp
new file mode 100644
index 0000000000..cf24c68d21
--- /dev/null
+++ b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst, original_src_shape, data_layout);
+ auto k = std::make_unique<kernels::ClConvertFullyConnectedWeightsKernel>();
+ k->configure(compile_context, src, dst, original_src_shape, data_layout);
+ _kernel = std::move(k);
+}
+
+Status ClConvertFullyConnectedWeights::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout)
+{
+ return kernels::ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h
new file mode 100644
index 0000000000..c46152081c
--- /dev/null
+++ b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_H
+#define ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClConvertFullyConnectedWeightsKernel */
+class ClConvertFullyConnectedWeights : public IClOperator
+{
+public:
+ /** Initialise the kernel's inputs and outputs
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src The src tensor info. Data types supported: All.
+     * @param[out] dst                The dst tensor info. Data types supported: Same as @p src.
+ * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer).
+ * @param[in] data_layout The data layout the weights have been trained in.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClConvertFullyConnectedWeights::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const TensorShape &original_src_shape,
+ DataLayout data_layout);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_H */
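Editor's note: a minimal configuration sketch, not part of the patch. The shapes are hypothetical; the 2D weights tensor is kept square so the example does not depend on the exact row/column orientation expected by the fully connected layer, and the function name is made up.

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h"

using namespace arm_compute;

void convert_fc_weights_sketch()
{
    CLScheduler::get().default_init();

    // Original input entering the FC layer: [W, H, C] = [7, 7, 64], i.e. 3136 values once flattened.
    // The 2D weights are kept square (3136 inputs -> 3136 outputs) for simplicity.
    const TensorShape original_src_shape(7U, 7U, 64U);
    TensorInfo        src_info(TensorShape(3136U, 3136U), 1, DataType::F32);
    TensorInfo        dst_info(TensorShape(3136U, 3136U), 1, DataType::F32);

    opencl::ClConvertFullyConnectedWeights convert;
    ARM_COMPUTE_ERROR_THROW_ON(opencl::ClConvertFullyConnectedWeights::validate(
        &src_info, &dst_info, original_src_shape, DataLayout::NCHW /* layout the weights were trained in */));
    convert.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info, original_src_shape,
                      DataLayout::NCHW);
    // At run time, pass an ITensorPack holding the real tensors under ACL_SRC and ACL_DST.
}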
diff --git a/src/gpu/cl/operators/ClCopy.cpp b/src/gpu/cl/operators/ClCopy.cpp
new file mode 100644
index 0000000000..e2be7cebd4
--- /dev/null
+++ b/src/gpu/cl/operators/ClCopy.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClCopy.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClCopyKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClCopy::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, Window *dst_window)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst, dst_window);
+ auto k = std::make_unique<kernels::ClCopyKernel>();
+ k->configure(compile_context, src, dst, dst_window);
+ _kernel = std::move(k);
+}
+
+Status ClCopy::validate(const ITensorInfo *src, const ITensorInfo *dst, Window *dst_window)
+{
+ return kernels::ClCopyKernel::validate(src, dst, dst_window);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClCopy.h b/src/gpu/cl/operators/ClCopy.h
new file mode 100644
index 0000000000..fe9b58c607
--- /dev/null
+++ b/src/gpu/cl/operators/ClCopy.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_COPY_H
+#define ARM_COMPUTE_CL_COPY_H
+
+#include "arm_compute/core/Window.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClCopyKernel */
+class ClCopy : public IClOperator
+{
+public:
+ /** Initialise the function's source and destination.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: All.
+ * @param[out] dst Output tensor info. Data types supported: Same as @p src.
+     * @param[in]  dst_window      (Optional) Window to be used when copying into only part of the destination tensor. Default is nullptr.
+ *
+ */
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ Window *dst_window = nullptr);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClCopy::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Window *dst_window = nullptr);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_COPY_H */
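Editor's note: an end-to-end usage sketch, not part of the patch. It shows the configure-on-ITensorInfo / run-on-ITensorPack pattern shared by the single-kernel operators in this patch; shapes and the function name are made up.

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include "src/gpu/cl/operators/ClCopy.h"

using namespace arm_compute;

void clcopy_sketch()
{
    CLScheduler::get().default_init(); // create the CL context, queue and kernel library

    TensorInfo src_info(TensorShape(32U, 16U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(32U, 16U), 1, DataType::F32);

    opencl::ClCopy copy;
    ARM_COMPUTE_ERROR_THROW_ON(opencl::ClCopy::validate(&src_info, &dst_info));
    copy.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info);

    // Backing tensors are only needed at run time
    CLTensor src, dst;
    src.allocator()->init(src_info);
    dst.allocator()->init(dst_info);
    src.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC, &src);
    pack.add_tensor(TensorType::ACL_DST, &dst);
    copy.run(pack);
    CLScheduler::get().sync();
}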
diff --git a/src/gpu/cl/operators/ClCrop.cpp b/src/gpu/cl/operators/ClCrop.cpp
new file mode 100644
index 0000000000..6313e4fbb5
--- /dev/null
+++ b/src/gpu/cl/operators/ClCrop.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClCrop.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClCropKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClCrop::configure(const ClCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value,
+ Window *dst_window)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst, start, end, batch_index, extrapolation_value, dst_window);
+ auto k = std::make_unique<kernels::ClCropKernel>();
+ k->configure(compile_context, src, dst, start, end, batch_index, extrapolation_value, dst_window);
+ _kernel = std::move(k);
+}
+
+Status ClCrop::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value,
+ Window *dst_window)
+{
+ return kernels::ClCropKernel::validate(src, dst, start, end, batch_index, extrapolation_value, dst_window);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClCrop.h b/src/gpu/cl/operators/ClCrop.h
new file mode 100644
index 0000000000..e845cf372c
--- /dev/null
+++ b/src/gpu/cl/operators/ClCrop.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_CROP_H
+#define ARM_COMPUTE_CL_CROP_H
+
+#include "arm_compute/core/Window.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClCropKernel */
+class ClCrop : public IClOperator
+{
+public:
+ /** Initialise the function's source and destination.
+ *
+ * @note Supported tensor rank: up to 4
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data type supported: All. Data layouts supported: NHWC.
+ * @param[out] dst Destination tensor info. Data type supported: F32
+ * @param[in] start Coordinates of where to start cropping the image.
+ * @param[in] end Coordinates of where to end cropping the image.
+ * @param[in] batch_index Fourth dimension index of the 3D image to crop in @p src.
+ * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
+ * @param[in] dst_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value = 0,
+ Window *dst_window = nullptr);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClCrop::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ Coordinates2D start,
+ Coordinates2D end,
+ uint32_t batch_index,
+ float extrapolation_value = 0,
+ Window *dst_window = nullptr);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_CROP_H */
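Editor's note: a configuration sketch, not part of the patch. The shapes are hypothetical and the destination shape assumes inclusive crop bounds ([C, end.x - start.x + 1, end.y - start.y + 1] for an NHWC source); check the kernel's actual contract before relying on this.

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

#include "src/gpu/cl/operators/ClCrop.h"

using namespace arm_compute;

void clcrop_sketch()
{
    CLScheduler::get().default_init();

    // NHWC source batch: [C, W, H, N]
    TensorInfo src_info(TensorShape(3U, 224U, 224U, 4U), 1, DataType::U8);
    src_info.set_data_layout(DataLayout::NHWC);

    const Coordinates2D start{10, 20};
    const Coordinates2D end{109, 119}; // 100x100 region, assuming inclusive bounds
    TensorInfo          dst_info(TensorShape(3U, 100U, 100U), 1, DataType::F32);

    opencl::ClCrop crop;
    ARM_COMPUTE_ERROR_THROW_ON(opencl::ClCrop::validate(&src_info, &dst_info, start, end,
                                                        2 /* batch_index */, 0.f /* extrapolation_value */));
    crop.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info, start, end, 2, 0.f);
    // At run time the pack holds the batched image under ACL_SRC and the cropped output under ACL_DST.
}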
diff --git a/src/gpu/cl/operators/ClDequantize.cpp b/src/gpu/cl/operators/ClDequantize.cpp
new file mode 100644
index 0000000000..eb6f9e7abb
--- /dev/null
+++ b/src/gpu/cl/operators/ClDequantize.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClDequantize.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClDequantizeKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClDequantize::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst);
+ auto k = std::make_unique<kernels::ClDequantizeKernel>();
+ k->configure(compile_context, src, dst);
+ _kernel = std::move(k);
+}
+
+Status ClDequantize::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClDequantizeKernel::validate(src, dst);
+}
+
+void ClDequantize::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+ CLScheduler::get().enqueue_op(*_kernel.get(), tensors);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClDequantize.h b/src/gpu/cl/operators/ClDequantize.h
new file mode 100644
index 0000000000..ccaac2cd49
--- /dev/null
+++ b/src/gpu/cl/operators/ClDequantize.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_DEQUANTIZE_H
+#define ARM_COMPUTE_CL_DEQUANTIZE_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClDequantizeKernel that dequantizes an input tensor */
+class ClDequantize : public IClOperator
+{
+public:
+ /** Set the input and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16.
+     * @param[out] dst             Destination tensor info with the same dimensions as @p src. Data type supported: F16/F32.
+ */
+ void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClDequantize::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+ // Inherited method overridden
+ void run(ITensorPack &tensors) override;
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_DEQUANTIZE_H */
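Editor's note: a usage sketch, not part of the patch, dequantizing a QASYMM8 tensor into F32 (dst[i] = scale * (src[i] - offset)). Shapes, scale and offset are made up.

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/QuantizationInfo.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include "src/gpu/cl/operators/ClDequantize.h"

using namespace arm_compute;

void cldequantize_sketch()
{
    CLScheduler::get().default_init();

    TensorInfo src_info(TensorShape(64U, 64U), 1, DataType::QASYMM8, QuantizationInfo(0.0625f, 128));
    TensorInfo dst_info(TensorShape(64U, 64U), 1, DataType::F32);

    opencl::ClDequantize deq;
    deq.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info);

    CLTensor src, dst;
    src.allocator()->init(src_info);
    dst.allocator()->init(dst_info);
    src.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC, &src);
    pack.add_tensor(TensorType::ACL_DST, &dst);
    deq.run(pack); // dst[i] = scale * (src[i] - offset)
    CLScheduler::get().sync();
}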
diff --git a/src/gpu/cl/operators/ClDirectConv2d.cpp b/src/gpu/cl/operators/ClDirectConv2d.cpp
new file mode 100644
index 0000000000..17a196ce6b
--- /dev/null
+++ b/src/gpu/cl/operators/ClDirectConv2d.cpp
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2021-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClDirectConv2d.h"
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/gpu/cl/kernels/ClActivationKernel.h"
+#include "src/gpu/cl/kernels/ClDirectConv2dKernel.h"
+#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h"
+#include "src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h"
+#include "src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h"
+#include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h"
+
+using namespace arm_compute::cl_direct_conv;
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace
+{
+ITensorPack select_activation_src_dst(ITensorPack &tensors)
+{
+ ITensorPack pack;
+ pack.add_tensor(TensorType::ACL_SRC, tensors.get_tensor(TensorType::ACL_DST));
+ pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(TensorType::ACL_DST));
+ return pack;
+}
+
+DirectConvComputeKernelInfo
+config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info)
+{
+ // Get GPU target
+ GPUTarget gpu_target = CLScheduler::get().target();
+
+ std::unique_ptr<IClDirectConvKernelConfig> t = ClDirectConvKernelConfigurationFactory::create(gpu_target);
+
+ return t->configure(src, weights, conv_info);
+}
+
+} // namespace
+
+void ClDirectConv2d::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info);
+
+ // Initialize the direct convolution descriptor
+ const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, weights, conv_info);
+
+ // Configure direct convolution kernel
+ const ActivationLayerInfo conv2d_act_info =
+ (src->data_layout() == DataLayout::NHWC && is_data_type_float(src->data_type())) ? act_info
+ : ActivationLayerInfo();
+ auto k = std::make_unique<kernels::ClDirectConv2dKernel>();
+ k->set_target(CLScheduler::get().target());
+ k->configure(compile_context, src, weights, biases, dst, conv_info, conv2d_act_info, desc);
+ _direct_conv_kernel = std::move(k);
+
+ // Configure border handler
+ PixelValue zero_value(0.f);
+ if (is_data_type_quantized_asymmetric(src->data_type()))
+ {
+ zero_value = PixelValue(0, src->data_type(), src->quantization_info());
+ }
+ auto b = std::make_unique<CLFillBorderKernel>();
+ b->configure(compile_context, src, _direct_conv_kernel->border_size(), BorderMode::CONSTANT, zero_value);
+ _src_border_handler = std::move(b);
+
+    // Fused activation is only supported for NHWC and floating point types; otherwise run the activation as a separate kernel
+ if (act_info.enabled() && !conv2d_act_info.enabled())
+ {
+ auto a = std::make_unique<kernels::ClActivationKernel>();
+ a->configure(compile_context, dst, dst, act_info);
+ _activation_kernel = std::move(a);
+ }
+
+ // Tune kernels
+ CLScheduler::get().tune_kernel_static(*_direct_conv_kernel);
+}
+
+Status ClDirectConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
+{
+ // Initialize the direct convolution descriptor
+ const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, weights, conv_info);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, ActivationLayerInfo(), desc));
+ if (act_info.enabled())
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, dst, act_info));
+ }
+ return Status{};
+}
+
+void ClDirectConv2d::run(ITensorPack &tensors)
+{
+ // Run border handler
+ CLScheduler::get().enqueue_op(*_src_border_handler.get(), tensors, false);
+ // Run direct convolution
+ CLScheduler::get().enqueue_op(*_direct_conv_kernel.get(), tensors, false);
+ // Run activation kernel
+ if (_activation_kernel)
+ {
+ auto act_pack = select_activation_src_dst(tensors);
+ CLScheduler::get().enqueue_op(*_activation_kernel.get(), act_pack, false);
+ }
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClDirectConv2d.h b/src/gpu/cl/operators/ClDirectConv2d.h
new file mode 100644
index 0000000000..0f18490814
--- /dev/null
+++ b/src/gpu/cl/operators/ClDirectConv2d.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_DIRECT_CONV2D_H
+#define ARM_COMPUTE_CL_DIRECT_CONV2D_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/IClOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run a direct convolution layer. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if padding size is different from zero)
+ * -# @ref kernels::ClDirectConv2dKernel (followed by @ref kernels::ClActivationKernel if the activation cannot be fused)
+ */
+class ClDirectConv2d : public IClOperator
+{
+public:
+ ClDirectConv2d() = default;
+ /** Set the src and dst tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor. 3 lower dimensions represent a single src [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of srcs.
+ * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+     * @param[in]  weights         Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p src.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type.
+ * @param[out] dst Destination tensor. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent batch of dsts.
+ * Data types supported: Same as @p src.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ *
+ */
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClDirectConv2d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited method overridden
+ void run(ITensorPack &tensors) override;
+
+private:
+ std::unique_ptr<IClKernel> _direct_conv_kernel{nullptr};
+ std::unique_ptr<IClKernel> _src_border_handler{nullptr};
+ std::unique_ptr<IClKernel> _activation_kernel{nullptr};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_DIRECT_CONV2D_H */
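Editor's note: an end-to-end sketch, not part of the patch: an NHWC F32 3x3 direct convolution with a fused ReLU. Shapes are hypothetical; the NHWC shape ordering is [C, W, H, N] with weights as [IFM, kernel_x, kernel_y, OFM].

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/function_info/ActivationLayerInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include "src/gpu/cl/operators/ClDirectConv2d.h"

using namespace arm_compute;

void cldirectconv2d_sketch()
{
    CLScheduler::get().default_init();

    // NHWC shapes are [C, W, H, N]; weights are [IFM, kernel_x, kernel_y, OFM]
    TensorInfo src_info(TensorShape(8U, 16U, 16U, 1U), 1, DataType::F32);
    TensorInfo wei_info(TensorShape(8U, 3U, 3U, 4U), 1, DataType::F32);
    TensorInfo bia_info(TensorShape(4U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(4U, 16U, 16U, 1U), 1, DataType::F32);
    src_info.set_data_layout(DataLayout::NHWC);
    wei_info.set_data_layout(DataLayout::NHWC);
    dst_info.set_data_layout(DataLayout::NHWC);

    const PadStrideInfo       conv_info(1, 1, 1, 1); // stride 1, pad 1 -> same spatial size
    const ActivationLayerInfo act_info(ActivationLayerInfo::ActivationFunction::RELU);

    opencl::ClDirectConv2d conv;
    ARM_COMPUTE_ERROR_THROW_ON(
        opencl::ClDirectConv2d::validate(&src_info, &wei_info, &bia_info, &dst_info, conv_info, act_info));
    conv.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &wei_info, &bia_info, &dst_info,
                   conv_info, act_info);

    CLTensor src, wei, bia, dst;
    src.allocator()->init(src_info);
    wei.allocator()->init(wei_info);
    bia.allocator()->init(bia_info);
    dst.allocator()->init(dst_info);
    src.allocator()->allocate();
    wei.allocator()->allocate();
    bia.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC_0, &src); // input
    pack.add_tensor(TensorType::ACL_SRC_1, &wei); // weights
    pack.add_tensor(TensorType::ACL_SRC_2, &bia); // biases
    pack.add_tensor(TensorType::ACL_DST, &dst);
    conv.run(pack);
    CLScheduler::get().sync();
}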
diff --git a/src/gpu/cl/operators/ClDirectConv3d.cpp b/src/gpu/cl/operators/ClDirectConv3d.cpp
new file mode 100644
index 0000000000..b08347936b
--- /dev/null
+++ b/src/gpu/cl/operators/ClDirectConv3d.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClDirectConv3d.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/gpu/cl/kernels/ClDirectConv3dKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClDirectConv3d::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ ITensorInfo *dst,
+ const Conv3dInfo &conv3d_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src0);
+
+ // Configure direct convolution 3d kernel
+ auto k = std::make_unique<kernels::ClDirectConv3dKernel>();
+ k->configure(compile_context, src0, src1, src2, dst, conv3d_info);
+ _direct_conv3d_kernel = std::move(k);
+}
+
+Status ClDirectConv3d::validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const Conv3dInfo &conv3d_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv3dKernel::validate(src0, src1, src2, dst, conv3d_info));
+ return Status{};
+}
+
+void ClDirectConv3d::run(ITensorPack &tensors)
+{
+ // Run direct convolution 3d
+ CLScheduler::get().enqueue_op(*_direct_conv3d_kernel.get(), tensors, true);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClDirectConv3d.h b/src/gpu/cl/operators/ClDirectConv3d.h
new file mode 100644
index 0000000000..5fb32460e2
--- /dev/null
+++ b/src/gpu/cl/operators/ClDirectConv3d.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_DIRECT_CONV3D_H
+#define ARM_COMPUTE_CL_DIRECT_CONV3D_H
+
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/IClOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class CLCompileContext;
+struct Conv3dInfo;
+class IClKernel;
+
+namespace opencl
+{
+/** Basic function to run a direct convolution layer with 3 spatial dimensions. This function calls the following OpenCL kernels:
+ *
+ * -# @ref kernels::ClDirectConv3dKernel
+ */
+class ClDirectConv3d : public IClOperator
+{
+public:
+ ClDirectConv3d() = default;
+ /** Set the src and dst tensors.
+ *
+ * Valid data layouts:
+ * - NDHWC
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:--------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src0 Source tensor. 4 lower dimensions represent a single src [IFM, width, height, depth],
+ * while every optional dimension from 5 and above represent a batch of srcs.
+ * @param[in] src1 Weights tensor. Weights are 5D tensor with dimensions [OFM, IFM, kernel_w, kernel_h, kernel_d].
+ * @param[in] src2 Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * @param[out] dst Destination tensor. 4 lower dimensions represent a single dst [OFM, width, height, depth], while the rest represent batch of dsts.
+ * @param[in] conv3d_info Contains strides, padding, rounding, activation, dilation and fast math information. Activation and fast math are currently unused.
+ *
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ ITensorInfo *dst,
+ const Conv3dInfo &conv3d_info);
+
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClDirectConv3d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0,
+ const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const Conv3dInfo &conv3d_info);
+
+ // Inherited method overridden
+ void run(ITensorPack &tensors) override;
+
+private:
+ std::unique_ptr<IClKernel> _direct_conv3d_kernel{nullptr};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_DIRECT_CONV3D_H */
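Editor's note: a validation-level sketch, not part of the patch, assuming Conv3dInfo defaults to stride 1, no padding and no activation (as in FunctionDescriptors.h). NDHWC shapes are ordered [C, W, H, D, N]; the function name is made up.

#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/FunctionDescriptors.h"

#include "src/gpu/cl/operators/ClDirectConv3d.h"

using namespace arm_compute;

Status cldirectconv3d_validate_sketch()
{
    // NDHWC shapes are [C, W, H, D, N]; weights are [OFM, IFM, kernel_w, kernel_h, kernel_d]
    TensorInfo src_info(TensorShape(4U, 8U, 8U, 8U, 1U), 1, DataType::F32);
    TensorInfo wei_info(TensorShape(2U, 4U, 3U, 3U, 3U), 1, DataType::F32);
    TensorInfo bia_info(TensorShape(2U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(2U, 6U, 6U, 6U, 1U), 1, DataType::F32); // 8 - 3 + 1 = 6 per spatial dim
    src_info.set_data_layout(DataLayout::NDHWC);
    dst_info.set_data_layout(DataLayout::NDHWC);

    const Conv3dInfo conv3d_info{}; // assumed defaults: stride {1,1,1}, no padding, no activation

    // At run time the pack uses ACL_SRC_0/ACL_SRC_1/ACL_SRC_2 for src/weights/biases and ACL_DST for the output.
    return opencl::ClDirectConv3d::validate(&src_info, &wei_info, &bia_info, &dst_info, conv3d_info);
}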
diff --git a/src/gpu/cl/operators/ClElementwiseOperations.cpp b/src/gpu/cl/operators/ClElementwiseOperations.cpp
new file mode 100644
index 0000000000..1325371d19
--- /dev/null
+++ b/src/gpu/cl/operators/ClElementwiseOperations.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClElementwiseOperations.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClElementwiseDivision::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info);
+ auto k = std::make_unique<kernels::ClArithmeticKernel>();
+ k->configure(compile_context, ArithmeticOperation::DIV, src1, src2, dst, act_info);
+ _kernel = std::move(k);
+}
+
+Status ClElementwiseDivision::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
+{
+ return kernels::ClArithmeticKernel::validate(ArithmeticOperation::DIV, src1, src2, dst, act_info);
+}
+
+void ClElementwiseMax::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info);
+ auto k = std::make_unique<kernels::ClArithmeticKernel>();
+ k->configure(compile_context, ArithmeticOperation::MAX, src1, src2, dst, act_info);
+ _kernel = std::move(k);
+}
+
+Status ClElementwiseMax::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
+{
+ return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MAX, src1, src2, dst, act_info);
+}
+
+void ClElementwiseMin::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info);
+ auto k = std::make_unique<kernels::ClArithmeticKernel>();
+ k->configure(compile_context, ArithmeticOperation::MIN, src1, src2, dst, act_info);
+ _kernel = std::move(k);
+}
+
+Status ClElementwiseMin::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
+{
+ return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MIN, src1, src2, dst, act_info);
+}
+
+void ClElementwiseSquaredDiff::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info);
+ auto k = std::make_unique<kernels::ClArithmeticKernel>();
+ k->configure(compile_context, ArithmeticOperation::SQUARED_DIFF, src1, src2, dst, act_info);
+ _kernel = std::move(k);
+}
+
+Status ClElementwiseSquaredDiff::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
+{
+ return kernels::ClArithmeticKernel::validate(ArithmeticOperation::SQUARED_DIFF, src1, src2, dst, act_info);
+}
+
+void ClElementwisePower::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info);
+ auto k = std::make_unique<kernels::ClArithmeticKernel>();
+ k->configure(compile_context, ArithmeticOperation::POWER, src1, src2, dst, act_info);
+ _kernel = std::move(k);
+}
+
+Status ClElementwisePower::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
+{
+ return kernels::ClArithmeticKernel::validate(ArithmeticOperation::POWER, src1, src2, dst, act_info);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClElementwiseOperations.h b/src/gpu/cl/operators/ClElementwiseOperations.h
new file mode 100644
index 0000000000..de7c018d75
--- /dev/null
+++ b/src/gpu/cl/operators/ClElementwiseOperations.h
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H
+#define ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for division
+ *
+ * @note The tensor data type for the inputs must be F16/F32.
+ * @note The function performs an arithmetic division between two tensors.
+ */
+class ClElementwiseDivision : public IClOperator
+{
+public:
+ /** Configure function for a given list of arguments.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src1 First source tensor info. Data types supported: F16/F32.
+     * @param[in]  src2            Second source tensor info. Data types supported: same as @p src1.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src1.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClElementwiseDivision::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+
+/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for max
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
+ * @note The function performs a max operation between two tensors.
+ */
+class ClElementwiseMax : public IClOperator
+{
+public:
+ /** Configure function for a given list of arguments.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
+ * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src1.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClElementwiseMax::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+
+/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for min
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
+ * @note The function performs a min operation between two tensors.
+ */
+class ClElementwiseMin : public IClOperator
+{
+public:
+ /** Configure function for a given list of arguments.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
+ * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src1.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClElementwiseMin::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+
+/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for squared difference
+ *
+ * @note The tensor data type for the inputs must be QASYMM8/U8/S16/QSYMM16/F16/F32.
+ * @note The function performs a squared difference operation between two tensors (i.e., out[i] = (in1[i] - in2[i])^2)
+ */
+class ClElementwiseSquaredDiff : public IClOperator
+{
+public:
+ /** Configure function for a given list of arguments.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+ * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src1.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClElementwiseSquaredDiff::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+
+/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for power
+ *
+ * @note The tensor data type for the inputs must be F16/F32.
+ * @note The function performs an elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i])
+ */
+class ClElementwisePower : public IClOperator
+{
+public:
+ /** Configure function for a given list of arguments.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src1 First source tensor info. Data types supported: F16/F32.
+ * @param[in] src2 Second source tensor info. Data types supported: F16/F32.
+     * @param[out] dst             Destination tensor info. Data types supported: F16/F32.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClElementwisePower::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H */
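Editor's note: a usage sketch, not part of the patch, computing the elementwise max of two F32 tensors. The other binary operators declared above (division, min, squared difference, power) follow the same call pattern, differing only in the class used; shapes and the function name are made up.

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include "src/gpu/cl/operators/ClElementwiseOperations.h"

using namespace arm_compute;

void clelementwise_max_sketch()
{
    CLScheduler::get().default_init();

    TensorInfo a_info(TensorShape(128U, 24U), 1, DataType::F32);
    TensorInfo b_info(TensorShape(128U, 24U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(128U, 24U), 1, DataType::F32);

    opencl::ClElementwiseMax max_op;
    max_op.configure(CLKernelLibrary::get().get_compile_context(), &a_info, &b_info, &dst_info);

    CLTensor a, b, dst;
    a.allocator()->init(a_info);
    b.allocator()->init(b_info);
    dst.allocator()->init(dst_info);
    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC_0, &a);
    pack.add_tensor(TensorType::ACL_SRC_1, &b);
    pack.add_tensor(TensorType::ACL_DST, &dst);
    max_op.run(pack); // dst[i] = max(a[i], b[i])
    CLScheduler::get().sync();
}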
diff --git a/src/gpu/cl/operators/ClElementwiseUnary.cpp b/src/gpu/cl/operators/ClElementwiseUnary.cpp
new file mode 100644
index 0000000000..914621183e
--- /dev/null
+++ b/src/gpu/cl/operators/ClElementwiseUnary.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClElementwiseUnary.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClRsqrt::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst);
+ auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
+ k->configure(compile_context, src, dst, ElementWiseUnary::RSQRT);
+ _kernel = std::move(k);
+}
+
+Status ClRsqrt::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::RSQRT);
+}
+
+void ClExp::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst);
+ auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
+ k->configure(compile_context, src, dst, ElementWiseUnary::EXP);
+ _kernel = std::move(k);
+}
+
+Status ClExp::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::EXP);
+}
+
+void ClNeg::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst);
+ auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
+ k->configure(compile_context, src, dst, ElementWiseUnary::NEG);
+ _kernel = std::move(k);
+}
+
+Status ClNeg::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::NEG);
+}
+
+void ClSin::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst);
+ auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
+ k->configure(compile_context, src, dst, ElementWiseUnary::SIN);
+ _kernel = std::move(k);
+}
+
+Status ClSin::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::SIN);
+}
+
+void ClAbs::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst);
+ auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
+ k->configure(compile_context, src, dst, ElementWiseUnary::ABS);
+ _kernel = std::move(k);
+}
+
+Status ClAbs::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::ABS);
+}
+
+void ClLog::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst);
+ auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
+ k->configure(compile_context, src, dst, ElementWiseUnary::LOG);
+ _kernel = std::move(k);
+}
+
+Status ClLog::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::LOG);
+}
+
+void ClRound::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst);
+ auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
+ k->configure(compile_context, src, dst, ElementWiseUnary::ROUND);
+ _kernel = std::move(k);
+}
+
+Status ClRound::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::ROUND);
+}
+} // namespace opencl
+} // namespace arm_compute
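Editor's note: a short sketch, not part of the patch. All of the unary operators above wrap the same ClElementWiseUnaryKernel, parameterised by the ElementWiseUnary operation, so they share one call pattern; ClRsqrt is shown here with made-up shapes and function name.

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

#include "src/gpu/cl/operators/ClElementwiseUnary.h"

using namespace arm_compute;

void clrsqrt_sketch()
{
    CLScheduler::get().default_init();

    TensorInfo src_info(TensorShape(256U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(256U), 1, DataType::F32);

    opencl::ClRsqrt rsqrt;
    ARM_COMPUTE_ERROR_THROW_ON(opencl::ClRsqrt::validate(&src_info, &dst_info));
    rsqrt.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info);
    // Run with an ITensorPack holding ACL_SRC and ACL_DST, as in the ClCopy sketch earlier.
}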
diff --git a/src/gpu/cl/operators/ClElementwiseUnary.h b/src/gpu/cl/operators/ClElementwiseUnary.h
new file mode 100644
index 0000000000..a23b789ab5
--- /dev/null
+++ b/src/gpu/cl/operators/ClElementwiseUnary.h
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_ELEMENTWISE_UNARY_H
+#define ARM_COMPUTE_CL_ELEMENTWISE_UNARY_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to perform inverse square root on an src tensor. */
+class ClRsqrt : public IClOperator
+{
+public:
+ /** Initialize the function
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src.
+ */
+ void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClRsqrt::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+
+/** Basic function to perform exponential on an src tensor. */
+class ClExp : public IClOperator
+{
+public:
+ /** Initialize the function
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src.
+ */
+ void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClExp::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+
+/** Basic function to negate an src tensor. */
+class ClNeg : public IClOperator
+{
+public:
+ /** Initialize the function
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src.
+ */
+ void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClNeg::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+
+/** Basic function to calculate sine of an src tensor. */
+class ClSin : public IClOperator
+{
+public:
+ /** Initialize the function
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src.
+ */
+ void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClSin::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+
+/** Basic function to perform elementwise log on an src tensor. */
+class ClLog : public IClOperator
+{
+public:
+ /** Initialize the function
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src.
+ */
+ void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClLog::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+
+/** Basic function to get the absolute value of an src tensor. */
+class ClAbs : public IClOperator
+{
+public:
+ /** Initialize the function
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src.
+ */
+ void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClAbs::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+
+/** Basic function to get the round (to the nearest even) value of an src tensor. */
+class ClRound : public IClOperator
+{
+public:
+ /** Initialize the function
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src.
+ */
+ void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClRound::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_ELEMENTWISE_UNARY_H */
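
The operators declared above are thin stateless wrappers around ClElementWiseUnaryKernel: configure() builds the kernel from tensor metadata only, and run() (inherited from IClOperator) enqueues it with the actual tensors. A minimal usage sketch, assuming a CL context initialised through CLScheduler and the internal headers added by this patch; tensor shape, data type and names are illustrative, not taken from the patch:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/gpu/cl/operators/ClElementwiseUnary.h"

using namespace arm_compute;

void run_exp_sketch()
{
    CLScheduler::get().default_init(); // set up the default CL context and queue

    // Illustrative 16x16 F32 tensors
    TensorInfo info(TensorShape(16U, 16U), 1, DataType::F32);
    CLTensor   src, dst;
    src.allocator()->init(info);
    dst.allocator()->init(info);
    src.allocator()->allocate();
    dst.allocator()->allocate();

    opencl::ClExp exp_op;
    ARM_COMPUTE_ERROR_THROW_ON(opencl::ClExp::validate(src.info(), dst.info()));
    exp_op.configure(CLKernelLibrary::get().get_compile_context(), src.info(), dst.info());

    // Tensors are only bound at run time, through an ITensorPack
    ITensorPack pack{{ACL_SRC, &src}, {ACL_DST, &dst}};
    exp_op.run(pack);
}

The same pattern applies to ClRsqrt, ClNeg, ClSin, ClAbs, ClLog and ClRound; only the ElementWiseUnary operation selected in configure() differs.
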
diff --git a/src/gpu/cl/operators/ClFill.cpp b/src/gpu/cl/operators/ClFill.cpp
new file mode 100644
index 0000000000..817b15ab20
--- /dev/null
+++ b/src/gpu/cl/operators/ClFill.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClFill.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClFillKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClFill::configure(const ClCompileContext &compile_context,
+ ITensorInfo *tensor,
+ const PixelValue &constant_value,
+ Window *dst_window)
+{
+ ARM_COMPUTE_LOG_PARAMS(tensor, constant_value, dst_window);
+ auto k = std::make_unique<kernels::ClFillKernel>();
+ k->configure(compile_context, tensor, constant_value, dst_window);
+ _kernel = std::move(k);
+}
+
+Status ClFill::validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *dst_window)
+{
+ return kernels::ClFillKernel::validate(tensor, constant_value, dst_window);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClFill.h b/src/gpu/cl/operators/ClFill.h
new file mode 100644
index 0000000000..e13862aa6b
--- /dev/null
+++ b/src/gpu/cl/operators/ClFill.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_FILL_H
+#define ARM_COMPUTE_CL_FILL_H
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClFillKernel */
+class ClFill : public IClOperator
+{
+public:
+ /** Initialise the kernel's tensor and filling value
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in,out] tensor Source tensor info. Supported data types: All.
+ * @param[in] constant_value The value used to fill the planes of the tensor
+ * @param[in] window Window to be used in case setting only part of a tensor. Default is nullptr.
+ */
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *tensor,
+ const PixelValue &constant_value,
+ Window *window = nullptr);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClFill::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_FILL_H */
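
ClFill likewise wraps a single kernel, but the tensor being filled is both input and output. A hedged sketch of driving it directly, assuming the tensor is bound as ACL_SRC_DST at run time (mirroring how the runtime fill function packs its tensor) and reusing the setup from the previous sketch:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/gpu/cl/operators/ClFill.h"

// Fill an already-allocated CLTensor with zeros (illustrative sketch)
void fill_with_zero_sketch(arm_compute::CLTensor &tensor)
{
    using namespace arm_compute;
    const PixelValue zero(0.0, tensor.info()->data_type());

    opencl::ClFill fill;
    ARM_COMPUTE_ERROR_THROW_ON(opencl::ClFill::validate(tensor.info(), zero));
    fill.configure(CLKernelLibrary::get().get_compile_context(), tensor.info(), zero);

    ITensorPack pack{{ACL_SRC_DST, &tensor}}; // assumed binding: a single in/out tensor
    fill.run(pack);
}

Passing a Window to configure() restricts the fill to part of the tensor; leaving it as nullptr fills the whole tensor.
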
diff --git a/src/gpu/cl/operators/ClFlatten.cpp b/src/gpu/cl/operators/ClFlatten.cpp
new file mode 100644
index 0000000000..7532532c94
--- /dev/null
+++ b/src/gpu/cl/operators/ClFlatten.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClFlatten.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClReshapeKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClFlatten::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst);
+ auto k = std::make_unique<kernels::ClReshapeKernel>();
+ k->configure(compile_context, src, dst);
+ _kernel = std::move(k);
+}
+
+Status ClFlatten::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClReshapeKernel::validate(src, dst);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClFlatten.h b/src/gpu/cl/operators/ClFlatten.h
new file mode 100644
index 0000000000..d2ce3b701d
--- /dev/null
+++ b/src/gpu/cl/operators/ClFlatten.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_FLATTEN_H
+#define ARM_COMPUTE_CL_FLATTEN_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to flatten a given input */
+class ClFlatten : public IClOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor to flatten with at least 3 dimensions.
+ * The dimensions above the third will be interpreted as batches. Data types supported: All
+ * @param[out] dst Destination tensor with shape [w*h*d, input_batches] where:
+ * w = width input tensor, h = height input tensor and d = depth input tensor.
+ * Data type supported: same as @p src
+ */
+ void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClFlatten::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_FLATTEN_H */
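
Because ClFlatten is implemented with ClReshapeKernel, the only shape logic lives in the destination TensorInfo. A small sketch of preparing the flattened output for a post-convolution tensor, assuming the compute_flatten_shape() helper from ShapeCalculator; the 7x7x64 activation with 4 batches is an illustrative value:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "src/gpu/cl/operators/ClFlatten.h"

using namespace arm_compute;

void flatten_shape_sketch()
{
    // Post-convolution activation: W=7, H=7, C=64, batches=4
    const TensorInfo src(TensorShape(7U, 7U, 64U, 4U), 1, DataType::F32);

    // compute_flatten_shape() collapses W*H*C into one dimension -> (3136, 4)
    TensorInfo dst = src;
    dst.set_tensor_shape(misc::shape_calculator::compute_flatten_shape(&src));

    ARM_COMPUTE_ERROR_THROW_ON(opencl::ClFlatten::validate(&src, &dst));
}

This is the same destination shape that ClFullyConnected builds for its internal _flattened_src tensor further down in this patch.
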
diff --git a/src/gpu/cl/operators/ClFloor.cpp b/src/gpu/cl/operators/ClFloor.cpp
new file mode 100644
index 0000000000..6790160172
--- /dev/null
+++ b/src/gpu/cl/operators/ClFloor.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClFloor.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClFloorKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClFloor::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst);
+ auto k = std::make_unique<kernels::ClFloorKernel>();
+ k->configure(compile_context, src, dst);
+ _kernel = std::move(k);
+}
+
+Status ClFloor::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClFloorKernel::validate(src, dst);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClFloor.h b/src/gpu/cl/operators/ClFloor.h
new file mode 100644
index 0000000000..746147335e
--- /dev/null
+++ b/src/gpu/cl/operators/ClFloor.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_FLOOR_H
+#define ARM_COMPUTE_CL_FLOOR_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClFloorKernel */
+class ClFloor : public IClOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ * @param[out] dst Destination tensor info. Data type supported: same as @p src
+ */
+ void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClFloor::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_FLOOR_H */
diff --git a/src/gpu/cl/operators/ClFullyConnected.cpp b/src/gpu/cl/operators/ClFullyConnected.cpp
new file mode 100644
index 0000000000..6969ac8ab3
--- /dev/null
+++ b/src/gpu/cl/operators/ClFullyConnected.cpp
@@ -0,0 +1,698 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClFullyConnected.h"
+
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h"
+#include "src/gpu/cl/operators/ClFlatten.h"
+#include "src/gpu/cl/operators/ClGemm.h"
+#include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"
+#include "src/gpu/cl/operators/ClMatMul.h"
+#include "src/gpu/cl/operators/ClTranspose.h"
+#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h"
+#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h"
+#include "support/Cast.h"
+
+#include <algorithm>
+
+namespace arm_compute
+{
+namespace opencl
+{
+using namespace arm_compute::experimental;
+using namespace arm_compute::misc::shape_calculator;
+
+namespace
+{
+// Function to calculate batched tensor shape in format [M, 1, B0, B1 ..] which is the format matmul expects
+inline TensorShape get_reshaped_matmul_tensor(const TensorShape &src)
+{
+ return TensorShape(src.x(), 1, src.y(), src.collapsed_from(2).z()); // Return value optimisation
+}
+
+Status construct_gemmlowp_output_stage(const ITensorInfo &src,
+ const ITensorInfo &weights,
+ const ITensorInfo &dst,
+ GEMMLowpOutputStageInfo &gemmlowp_output_stage,
+ ActivationLayerInfo activation_info)
+{
+ gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ gemmlowp_output_stage.gemmlowp_offset = 0;
+ gemmlowp_output_stage.gemmlowp_multiplier = 0;
+ gemmlowp_output_stage.gemmlowp_shift = 0;
+
+ const auto data_type = src.data_type();
+
+ // Configure output stage for quantized case
+ if (is_data_type_quantized_asymmetric(data_type))
+ {
+ const QuantizationInfo oq_info = dst.quantization_info();
+ const UniformQuantizationInfo iq_unif = src.quantization_info().uniform();
+ const UniformQuantizationInfo wq_unif = weights.quantization_info().uniform();
+ const UniformQuantizationInfo oq_unif = oq_info.uniform();
+
+ const auto output_quant_info = (dst.total_size() == 0) ? iq_unif : oq_unif;
+
+ const float multiplier = (iq_unif.scale * wq_unif.scale) / output_quant_info.scale;
+ int output_multiplier = 0;
+ int output_shift = 0;
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+
+ PixelValue type_min{};
+ PixelValue type_max{};
+ std::tie(type_min, type_max) = get_min_max(data_type);
+
+ if (activation_info.enabled())
+ {
+ std::tie(type_min, type_max) =
+ get_quantized_activation_min_max(activation_info, data_type, output_quant_info);
+ }
+
+ // Set the GEMMLowp output stage info
+ gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset;
+ gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier;
+ gemmlowp_output_stage.gemmlowp_shift = output_shift;
+ gemmlowp_output_stage.gemmlowp_multipliers.push_back(output_multiplier);
+ gemmlowp_output_stage.gemmlowp_shifts.push_back(output_shift);
+ type_min.get(gemmlowp_output_stage.gemmlowp_min_bound);
+ type_max.get(gemmlowp_output_stage.gemmlowp_max_bound);
+ }
+
+ return Status{};
+}
+
+Status validate_mm(const ITensorInfo &src,
+ const ITensorInfo &weights,
+ const ITensorInfo *bias,
+ const ITensorInfo &dst,
+ const FullyConnectedLayerInfo &fc_info,
+ bool use_matmul)
+{
+ // Note : If weights are dynamic and data is not batched, use matmul, else use gemm
+ const bool transpose_weights = fc_info.transpose_weights ? !fc_info.are_weights_reshaped : false;
+ const bool use_dynamic_gemm =
+ !use_matmul && !weights.are_values_constant() && transpose_weights; // use dynamic gemm as fallback for matmul
+ const bool is_quantized = is_data_type_quantized_asymmetric(src.data_type());
+
+ if (use_matmul)
+ {
+ const MatMulInfo m_info = MatMulInfo().adj_rhs(transpose_weights);
+
+ // Note: LHS is reshaped here to match ClMatMul expectations of batch index - From [M, B0, B1] to [M, 1, B0, B1]
+ TensorInfo lhs_to_use = src.clone()->set_tensor_shape(get_reshaped_matmul_tensor(src.tensor_shape()));
+
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ std::unique_ptr<cl_matmul::IClMatMulNativeKernelConfig> t =
+ cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target);
+ const MatMulKernelInfo kernel_info = t->configure(&lhs_to_use, &weights, m_info);
+
+ return is_quantized ? kernels::ClMatMulLowpNativeKernel::validate(&lhs_to_use, &weights, bias, &dst,
+ kernel_info, fc_info.activation_info)
+ : kernels::ClMatMulNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, kernel_info,
+ fc_info.activation_info);
+ }
+ else
+ {
+ GEMMLowpOutputStageInfo gemmlowp_output_stage;
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ construct_gemmlowp_output_stage(src, weights, dst, gemmlowp_output_stage, fc_info.activation_info));
+
+ const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
+ false, // is_b_reshaped
+ !use_dynamic_gemm, // reshape_b_only_on_first_run
+ 0, // depth_output_gemm3d
+ false, // reinterpret_input_as_3d
+ fc_info.retain_internal_weights, // retain_internal_weights
+ gemmlowp_output_stage, // gemmlowp_output_stage
+ fc_info.fp_mixed_precision, // fp_mixed_precision
+ false, // fast_math
+ true, // broadcast_bias
+ ActivationLayerInfo()); // activation_info
+
+ if (is_quantized)
+ {
+ const UniformQuantizationInfo iq_info = src.quantization_info().uniform();
+ const UniformQuantizationInfo wq_info = weights.quantization_info().uniform();
+
+ // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+ // Extract and negate src and weights offset
+ const QuantizationInfo src_quantization_info(iq_info.scale, -iq_info.offset);
+ const QuantizationInfo weights_quantization_info(wq_info.scale, -wq_info.offset);
+
+ // Validate gemmlowp function
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyCore::validate(
+ &src.clone()->set_quantization_info(src_quantization_info),
+ &weights.clone()->set_quantization_info(weights_quantization_info), bias, &dst, gemm_info));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemm::validate(&src, &weights, bias, &dst, 1.f, 1.f, gemm_info));
+ }
+ }
+
+ return Status{};
+}
+} // namespace
+
+ClFullyConnected::ClFullyConnected()
+ : _convert_weights(nullptr),
+ _flatten(nullptr),
+ _reshape_weights(nullptr),
+ _mm_gemm(nullptr),
+ _mm_gemmlowp(nullptr),
+ _matmul_native_kernel(nullptr),
+ _matmul_lowp_native_kernel(nullptr),
+ _aux_mem(Count)
+{
+}
+
+ClFullyConnected::~ClFullyConnected() = default;
+
+void ClFullyConnected::configure_mm(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const FullyConnectedLayerInfo &fc_info)
+{
+ // If weights are dynamic and matmul is supported use matmul, else use gemm
+ if (_use_matmul)
+ {
+ // Specify whether transpose weights is necessary in matmul info
+ const MatMulInfo mat_info = MatMulInfo().adj_rhs(_transpose_weights);
+
+ // Note: MatMul does not need offset negation unlike gemm
+ // 1. Change shape when calling matmul to fit batch expectations.
+ _lhs_to_use = src->clone()->set_tensor_shape(get_reshaped_matmul_tensor(_lhs_to_use.tensor_shape()));
+
+ // 2. Use heuristics to get kernel info object
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ std::unique_ptr<cl_matmul::IClMatMulNativeKernelConfig> kernel_config =
+ cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target);
+ MatMulKernelInfo kernel_info = kernel_config->configure(src, weights, mat_info);
+
+ // 3. Configure relevant matmul kernel
+ if (_is_quantized)
+ {
+ _matmul_lowp_native_kernel = std::make_unique<kernels::ClMatMulLowpNativeKernel>();
+ _matmul_lowp_native_kernel->set_target(gpu_target);
+ _matmul_lowp_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info,
+ fc_info.activation_info);
+ }
+ else
+ {
+ _matmul_native_kernel = std::make_unique<kernels::ClMatMulNativeKernel>();
+ _matmul_native_kernel->set_target(gpu_target);
+ _matmul_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info,
+ fc_info.activation_info);
+ }
+ }
+ else
+ {
+ // Configure GEMM
+ GEMMLowpOutputStageInfo gemmlowp_output_stage;
+ construct_gemmlowp_output_stage(*src, *weights, *dst, gemmlowp_output_stage, fc_info.activation_info);
+
+ const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
+ false, // is_b_reshaped
+ !_dynamic_gemm, // reshape_b_only_on_first_run
+ 0, // depth_output_gemm3d
+ false, // reinterpret_input_as_3d
+ fc_info.retain_internal_weights, // retain_internal_weights
+ gemmlowp_output_stage, // gemmlowp_output_stage
+ fc_info.fp_mixed_precision, // fp_mixed_precision
+ false, // fast_math
+ true, // broadcast_bias
+ fc_info.activation_info); // activation_info
+
+ if (_is_quantized)
+ {
+ // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+ // Extract and negate input and weights offset
+ const QuantizationInfo src_quantization_info = src->quantization_info();
+ const QuantizationInfo weights_quantization_info = weights->quantization_info();
+
+ TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info);
+ TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info);
+
+ src_info.set_quantization_info(
+ QuantizationInfo(src_quantization_info.uniform().scale, -src_quantization_info.uniform().offset));
+ weights_info.set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale,
+ -weights_quantization_info.uniform().offset));
+
+ // Configure gemmlowp function
+ _mm_gemmlowp = std::make_unique<ClGemmLowpMatrixMultiplyCore>();
+ _mm_gemmlowp->configure(compile_context, &src_info, &weights_info, bias, dst, gemm_info);
+ }
+ else
+ {
+ // Configure matrix multiply kernel
+ _mm_gemm = std::make_unique<ClGemm>();
+ _mm_gemm->configure(compile_context, src, weights, bias, dst, 1.f, 1.f, gemm_info);
+ }
+ }
+}
+
+void ClFullyConnected::configure_conv_fc(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const FullyConnectedLayerInfo &fc_info)
+{
+ // MatMul fuses transpose operation, so we use the first dimension for comparison where appropriate.
+ ARM_COMPUTE_ERROR_ON((weights->dimension((_use_matmul && _transpose_weights) ? 0 : 1) !=
+ (src->dimension(0) * src->dimension(1) * src->dimension(2))));
+
+ // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
+
+ // Initialize output tensor for flatten
+ _flattened_src = src->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(compute_flatten_shape(src))
+ .set_data_layout(DataLayout::NCHW);
+
+ // Configure flatten kernel
+ _flatten = std::make_unique<ClFlatten>();
+ _flatten->configure(compile_context, src, &_flattened_src);
+
+ // Note: any dimensions of the flattened output beyond the first are batch dimensions
+ // Configure matrix multiply kernel
+ configure_mm(compile_context, &_flattened_src, weights, bias, dst, fc_info);
+}
+
+void ClFullyConnected::configure_fc_fc(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const FullyConnectedLayerInfo &fc_info)
+{
+ // MatMul fuses transpose operation, so we use the first dimension for comparison where appropriate.
+ ARM_COMPUTE_ERROR_ON(src->dimension(0) != weights->dimension((_use_matmul && _transpose_weights) ? 0 : 1));
+
+ // Configure matrix multiply kernel
+ configure_mm(compile_context, src, weights, bias, dst, fc_info);
+}
+
+void ClFullyConnected::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+ const GPUTarget gpu_target = get_arch_from_target(CLScheduler::get().target());
+
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(ClFullyConnected::validate(src, weights, biases, dst, fc_info));
+ ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, fc_info);
+
+ _transpose_weights = fc_info.transpose_weights ? !fc_info.are_weights_reshaped : false;
+ _is_fc_after_conv = true;
+ _is_quantized = is_data_type_quantized_asymmetric(src->data_type());
+ _is_prepared = fc_info.retain_internal_weights;
+ _weights_to_use = TensorInfo(*weights);
+ _weights_to_use_idx = ACL_SRC_1;
+
+ // When using dynamic weights - use matmul kernels.
+ // Note: MatMul is not used in the following cases (Gemm is used as fallback) :
+ // 1. When the weights tensor is not dynamic
+ // 2. MatMul does not support broadcasting batch dimension, and therefore is disabled if fc is batched.
+ // 3. When FC is after convolution and src tensor data layout does not match weights trained data layout (weights conversion kernel is required)
+ const bool is_batched_fc_layer = dst->dimension(1) > 1;
+ _use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && !is_batched_fc_layer &&
+ !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout));
+ _dynamic_gemm = !weights->are_values_constant() && _transpose_weights && !_use_matmul;
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ // Check if we have a fully connected layer with batches
+ if (is_batched_fc_layer)
+ {
+ _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(),
+ dst->tensor_shape().cbegin() + 1));
+ }
+ else
+ {
+ _is_fc_after_conv = src->num_dimensions() > 1;
+ }
+
+ ITensorInfo *weights_used = weights;
+
+ // Reshape weights if needed - Not needed when matmul is in use as matmul fuses transpose op.
+ if (_transpose_weights && !_use_matmul)
+ {
+ // Reshape the weights
+ _reshape_weights = std::make_unique<ClTranspose>();
+ _reshape_weights->configure(compile_context, weights, &_reshaped_weights);
+ weights_used = &_reshaped_weights;
+ _weights_to_use_idx = offset_int_vec(TransposedWeights);
+ }
+
+ // Convert weights if needed
+ if (_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
+ {
+ // Convert weights
+ _convert_weights = std::make_unique<ClConvertFullyConnectedWeights>();
+ _convert_weights->configure(compile_context, weights_used, &_converted_weights, src->tensor_shape(),
+ fc_info.weights_trained_layout);
+
+ weights_used = &_converted_weights;
+ _weights_to_use_idx = offset_int_vec(ConvertedWeights);
+ _run_convert_weights = true;
+ }
+
+ if (_is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer without batches
+ configure_conv_fc(compile_context, src, weights_used, biases, dst, fc_info);
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer without batches
+ configure_fc_fc(compile_context, src, weights_used, biases, dst, fc_info);
+ }
+ // Update TensorInfo of final weights used (Need to be done in the end due to padding expansion)
+ _weights_to_use = *weights_used;
+
+ if (_use_matmul)
+ {
+ // Note : MatMul does not use transpose and does not need auxiliary memory, so only converted weights are added to aux_mem
+ _aux_mem[ConvertedWeights] =
+ MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Temporary, _converted_weights.total_size());
+ }
+ else
+ {
+ // Set auxiliary memory requirements for gemm operators
+ auto gemm_mem_req = (_is_quantized) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace();
+ for (unsigned int i = 0; i < gemm_mem_req.size(); ++i)
+ {
+ _aux_mem[i] = gemm_mem_req[i];
+ }
+ if (_aux_mem[1].size > 0 || _aux_mem[2].size > 0) // Persistent weights memory on GEMMs
+ {
+ // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch
+ // Keep all the auxiliary tensors in case of dynamic weights as they are recalculated every time
+ _aux_mem[TransposedWeights] = MemoryInfo(
+ offset_int_vec(TransposedWeights), _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare,
+ _reshaped_weights.total_size());
+ _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights),
+ _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare,
+ _converted_weights.total_size());
+ }
+ else
+ {
+ // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch
+ const auto transposed_wei_lft = (_weights_to_use_idx == offset_int_vec(TransposedWeights))
+ ? MemoryLifetime::Persistent
+ : MemoryLifetime::Prepare;
+ const auto converted_wei_lft = (_weights_to_use_idx == offset_int_vec(ConvertedWeights))
+ ? MemoryLifetime::Persistent
+ : MemoryLifetime::Prepare;
+
+ _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights),
+ _dynamic_gemm ? MemoryLifetime::Temporary : transposed_wei_lft,
+ _reshaped_weights.total_size());
+ _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights),
+ _dynamic_gemm ? MemoryLifetime::Temporary : converted_wei_lft,
+ _converted_weights.total_size());
+ }
+ }
+ _aux_mem[FlattenedSrc] =
+ MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size());
+}
+
+Status ClFullyConnected::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) &&
+ fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU &&
+ fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU &&
+ fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
+ const GPUTarget gpu_target = get_arch_from_target(CLScheduler::get().target());
+
+ const bool transpose_weights = fc_info.transpose_weights ? !fc_info.are_weights_reshaped : false;
+ bool is_fc_after_conv = true;
+
+ // When using dynamic weights - use matmul kernels.
+ // Note: MatMul does not support batch broadcasting, so fall back to GEMM for batched cases.
+ const bool is_batched_fc_layer = dst->dimension(1) > 1;
+ const bool use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() &&
+ !is_batched_fc_layer &&
+ !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout));
+
+ const ITensorInfo &flatten_src = TensorInfo(src->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(compute_flatten_shape(src))
+ .set_data_layout(DataLayout::NCHW));
+ const ITensorInfo &reshaped_weights = TensorInfo(
+ weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
+ const ITensorInfo &converted_weights = (transpose_weights && !use_matmul)
+ ? TensorInfo(*reshaped_weights.clone())
+ : TensorInfo(weights->clone()->set_is_resizable(true).reset_padding());
+
+ // With the Fully Connected layer we can have 4 different cases:
+ // 1) Convolution layer -> Fully Connected layer without batches
+ // 2) Fully Connected layer -> Fully Connected layer without batches
+ // 3) Convolution layer -> Fully Connected layer with batches
+ // 4) Fully Connected layer -> Fully Connected layer with batches
+
+ const ITensorInfo *src_to_use = src;
+ const ITensorInfo *weights_to_use = weights;
+
+ if (biases != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ if (is_data_type_quantized(src->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
+ }
+ }
+
+ // Check if FC is after conv (flatten kernel is run in case where FC is after conv.)
+ if (is_batched_fc_layer)
+ {
+ is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+ (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(),
+ dst->tensor_shape().cbegin() + 1));
+ }
+ else
+ {
+ is_fc_after_conv = src->num_dimensions() > 1;
+ }
+
+ // Transpose kernel does not run when matmul is supported as matmul fuses transpose op.
+ if (transpose_weights && !use_matmul)
+ {
+ // Validate reshape weights kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(ClTranspose::validate(weights, &reshaped_weights));
+ weights_to_use = &reshaped_weights;
+ }
+
+ if (is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
+ {
+ // Validate convert weights kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(ClConvertFullyConnectedWeights::validate(
+ weights_to_use, &converted_weights, src->tensor_shape(), fc_info.weights_trained_layout));
+ weights_to_use = &converted_weights;
+ }
+
+ if (is_fc_after_conv)
+ {
+ // Fully Connected layer after a Convolution Layer without batches
+ // K Index of matrix multiplication. MatMul performs transpose in kernel, so index is 0 when matmul and transpose enabled
+ const int weight_idx = (use_matmul && transpose_weights) ? 0 : 1;
+ ARM_COMPUTE_RETURN_ERROR_ON(
+ (weights_to_use->dimension(weight_idx) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
+
+ // Validate flatten kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(ClFlatten::validate(src, &flatten_src));
+ src_to_use = &flatten_src;
+ }
+ else
+ {
+ // Fully Connected layer after a Fully Connected Layer without batches
+ // K Index of matrix multiplication. MatMul performs transpose in kernel, so index is 0 when matmul and transpose enabled
+ const int weight_idx = (use_matmul && transpose_weights) ? 0 : 1;
+ ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != weights_to_use->dimension(weight_idx));
+ }
+
+ // Validate matrix multiply kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*src_to_use, *weights_to_use, biases, *dst, fc_info, use_matmul));
+
+ return Status{};
+}
+
+void ClFullyConnected::run(ITensorPack &tensors)
+{
+ prepare(tensors);
+
+#ifdef ARM_COMPUTE_ASSERTS_ENABLED
+ ++_asrt_run_count;
+ ARM_COMPUTE_ERROR_ON(_dynamic_gemm && _asrt_prepare_count != _asrt_run_count);
+#endif // ARM_COMPUTE_ASSERTS_ENABLED
+
+ auto src = tensors.get_const_tensor(ACL_SRC_0);
+
+ CLAuxTensorHandler flattened_src(offset_int_vec(FlattenedSrc), _flattened_src, tensors, false);
+ CLAuxTensorHandler weights(_weights_to_use_idx, _weights_to_use, tensors, false);
+
+ // Linearize input if it comes from a convolutional layer
+ if (_is_fc_after_conv)
+ {
+ ITensorPack flatten_pack{{ACL_SRC, src}, {ACL_DST, flattened_src.get()}};
+ _flatten->run(flatten_pack);
+ }
+
+ ITensorPack gemm_pack = tensors;
+ gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? flattened_src.get() : src);
+ if (_weights_to_use_idx != ACL_SRC_1)
+ {
+ gemm_pack.add_const_tensor(ACL_SRC_1, weights.get());
+ }
+
+ // Run MatMul Op
+ if (_use_matmul)
+ {
+ // Run matmul kernels for matrix multiplication
+ if (_is_quantized)
+ {
+ CLScheduler::get().enqueue_op(*_matmul_lowp_native_kernel, gemm_pack, true);
+ }
+ else
+ {
+ CLScheduler::get().enqueue_op(*_matmul_native_kernel, gemm_pack, true);
+ }
+ }
+ else
+ {
+ // Run matrix multiply
+ if (_is_quantized)
+ {
+ _mm_gemmlowp->run(gemm_pack);
+ }
+ else
+ {
+ _mm_gemm->run(gemm_pack);
+ }
+ }
+}
+
+void ClFullyConnected::prepare(ITensorPack &tensors)
+{
+ // Note : Running prepare() each run when _use_matmul is true is unnecessary unless weights conversion is needed.
+ if (!_is_prepared || _dynamic_gemm)
+ {
+#ifdef ARM_COMPUTE_ASSERTS_ENABLED
+ ++_asrt_prepare_count;
+ ARM_COMPUTE_ERROR_ON(!_dynamic_gemm && !_use_matmul && _asrt_prepare_count > 1);
+#endif // ARM_COMPUTE_ASSERTS_ENABLED
+
+ auto weights = tensors.get_const_tensor(ACL_SRC_1);
+
+ CLAuxTensorHandler reshaped_weights(offset_int_vec(TransposedWeights), _reshaped_weights, tensors, false);
+ CLAuxTensorHandler converted_weights(offset_int_vec(ConvertedWeights), _converted_weights, tensors, false);
+
+ // Pointer to current weights
+ const ITensor *cur_weights = weights;
+
+ // Reshape weights if needed. Disabled when matmul kernels are enabled as matmul fuses transpose.
+ if (_transpose_weights && !_use_matmul)
+ {
+ // Run reshape weights kernel and mark weights as unused
+ ITensorPack transpose_pack{{ACL_SRC, weights}, {ACL_DST, reshaped_weights.get()}};
+ _reshape_weights->run(transpose_pack);
+
+ cur_weights->mark_as_unused();
+ cur_weights = reshaped_weights.get();
+ }
+
+ // Convert weights if needed
+ if (_run_convert_weights)
+ {
+ ITensorPack convert_pack{{ACL_SRC, cur_weights}, {ACL_DST, converted_weights.get()}};
+ _convert_weights->run(convert_pack);
+
+ cur_weights->mark_as_unused();
+ cur_weights = converted_weights.get();
+ }
+
+ ITensorPack gemm_pack = tensors;
+ gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights);
+
+ // Run GEMM prepare and release unused weights
+ if (_dynamic_gemm || !_use_matmul)
+ {
+ if (!_is_quantized)
+ {
+ _mm_gemm->prepare(gemm_pack);
+ }
+ else
+ {
+ _mm_gemmlowp->prepare(gemm_pack);
+ }
+ }
+
+ _is_prepared = true;
+ }
+}
+
+experimental::MemoryRequirements ClFullyConnected::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace opencl
+} // namespace arm_compute
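
For the quantized path, construct_gemmlowp_output_stage() above folds the input, weights and output scales into a single real multiplier and lets calculate_quantized_multiplier() turn it into the fixed-point multiplier/shift pair carried by GEMMLowpOutputStageInfo. A small numeric sketch of that step in isolation; the scale values are made up for illustration:

#include "arm_compute/core/Error.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"

void output_stage_multiplier_sketch()
{
    // Assumed uniform quantization scales
    const float src_scale     = 0.5f;
    const float weights_scale = 0.25f;
    const float dst_scale     = 1.0f;

    // Same formula as construct_gemmlowp_output_stage(): requantisation factor from S32 accumulators to the output
    const float multiplier = (src_scale * weights_scale) / dst_scale; // 0.125

    int output_multiplier = 0;
    int output_shift      = 0;
    ARM_COMPUTE_ERROR_THROW_ON(
        arm_compute::quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
    // 0.125 is decomposed into a Q31 fixed-point multiplier and a shift; both are stored in GEMMLowpOutputStageInfo.
}
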
diff --git a/src/gpu/cl/operators/ClFullyConnected.h b/src/gpu/cl/operators/ClFullyConnected.h
new file mode 100644
index 0000000000..72884ff7ad
--- /dev/null
+++ b/src/gpu/cl/operators/ClFullyConnected.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_OPERATORS_CLFULLYCONNECTED_H
+#define ACL_SRC_GPU_CL_OPERATORS_CLFULLYCONNECTED_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/function_info/FullyConnectedLayerInfo.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+#include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h"
+#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+// Forward declarations
+class ClConvertFullyConnectedWeights;
+class ClFlatten;
+class ClGemm;
+class ClGemmLowpMatrixMultiplyCore;
+class ClTranspose;
+// Kernel Forward Declarations
+namespace kernels
+{
+class ClMatMulNativeKernel;
+class ClMatMulLowpNativeKernel;
+} // namespace kernels
+/** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following OpenCL kernels:
+ *
+ * -# @ref opencl::ClFlatten (called when the input comes from a convolutional layer)
+ * -# @ref opencl::ClTranspose (if @p are_weights_reshaped is set to false and transpose_weights is set to true) (called once)
+ * -# @ref opencl::ClGemm or @ref opencl::ClGemmLowpMatrixMultiplyCore (if quantized asymmetric)
+ *
+ * @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
+ */
+class ClFullyConnected : public IClOperator
+{
+public:
+ ClFullyConnected();
+ ~ClFullyConnected();
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] weights Weights tensor. The weights must be 2 dimensional.
+ * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the input's first 3 dimensions.
+ * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension.
+ * Data type supported: Same as @p src.
+ * @param[in] biases Bias tensor. Can be nullptr. Data type supported: Same as @p src.
+ * @param[out] dst Destination tensor. Its shape should be equal to the output of a matrix multiplication between:
+ * - The output of im2col on the input and the (transposed) 2D weights, if the function is called after a Convolution Layer
+ * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer.
+ * Data type supported: Same as @p src.
+ * @param[in] fc_info (Optional) Fully connected layer additional info
+ */
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClFullyConnected::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
+
+ // Inherited methods overridden
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ void configure_fc_fc(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const FullyConnectedLayerInfo &fc_info);
+ void configure_conv_fc(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const FullyConnectedLayerInfo &fc_info);
+ void configure_mm(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *bias,
+ ITensorInfo *dst,
+ const FullyConnectedLayerInfo &fc_info);
+
+private:
+ enum AuxTensorIdx
+ {
+ TransposedWeights = 10,
+ ConvertedWeights = 11,
+ FlattenedSrc = 12,
+ Count = 13
+ };
+
+ std::unique_ptr<ClConvertFullyConnectedWeights> _convert_weights;
+ std::unique_ptr<ClFlatten> _flatten;
+ std::unique_ptr<ClTranspose> _reshape_weights;
+ std::unique_ptr<ClGemm> _mm_gemm;
+ std::unique_ptr<ClGemmLowpMatrixMultiplyCore> _mm_gemmlowp;
+
+ std::unique_ptr<kernels::ClMatMulNativeKernel> _matmul_native_kernel;
+ std::unique_ptr<kernels::ClMatMulLowpNativeKernel> _matmul_lowp_native_kernel;
+
+ experimental::MemoryRequirements _aux_mem{};
+
+ TensorInfo _flattened_src{};
+ TensorInfo _converted_weights{};
+ TensorInfo _reshaped_weights{};
+ TensorInfo _lhs_to_use{};
+ TensorInfo _weights_to_use{};
+ int _weights_to_use_idx{ACL_SRC_1};
+
+ bool _run_convert_weights{false};
+ bool _transpose_weights{false};
+ bool _dynamic_gemm{false};
+ bool _use_matmul{false};
+
+ bool _is_fc_after_conv{true};
+ bool _is_quantized{false};
+ bool _is_prepared{false};
+
+#ifdef ARM_COMPUTE_ASSERTS_ENABLED
+ int _asrt_run_count{};
+ int _asrt_prepare_count{};
+#endif // ARM_COMPUTE_ASSERTS_ENABLED
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_OPERATORS_CLFULLYCONNECTED_H
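
The header above exposes both GEMM-based and MatMul-based members; which path is taken is decided once, from the conditions set up in ClFullyConnected::configure(). The predicate below restates that decision as standalone code for readability; it is an illustrative paraphrase under the stated assumptions, not an API of the library:

#include "arm_compute/core/GPUTarget.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/function_info/FullyConnectedLayerInfo.h"

// Illustrative paraphrase of the _use_matmul decision in ClFullyConnected::configure()
bool fc_would_use_matmul(const arm_compute::ITensorInfo             &src,
                         const arm_compute::ITensorInfo             &weights,
                         const arm_compute::ITensorInfo             &dst,
                         const arm_compute::FullyConnectedLayerInfo &fc_info,
                         arm_compute::GPUTarget                      gpu_arch)
{
    // MatMul kernels cannot broadcast the batch dimension, so batched FC falls back to GEMM
    const bool is_batched_fc_layer = dst.dimension(1) > 1;
    // If FC follows a convolution and layouts differ, a weights conversion pass is needed -> GEMM path
    const bool needs_weights_conversion =
        src.num_dimensions() > 1 && (src.data_layout() != fc_info.weights_trained_layout);

    return gpu_arch != arm_compute::GPUTarget::MIDGARD // no MatMul kernels for Midgard
           && !weights.are_values_constant()           // MatMul is only used for dynamic weights
           && !is_batched_fc_layer && !needs_weights_conversion;
}

When the predicate is false, the class falls back to ClGemm (or ClGemmLowpMatrixMultiplyCore for quantized types), with _dynamic_gemm selecting the non-reshaping GEMM variant for dynamic weights.
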
diff --git a/src/gpu/cl/operators/ClGemm.cpp b/src/gpu/cl/operators/ClGemm.cpp
new file mode 100644
index 0000000000..815c254c69
--- /dev/null
+++ b/src/gpu/cl/operators/ClGemm.cpp
@@ -0,0 +1,923 @@
+/*
+ * Copyright (c) 2017-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClGemm.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Log.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/ITensorAllocator.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/core/utils/helpers/float_ops.h"
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
+#include "src/runtime/CL/gemm/CLGEMMKernelSelection.h"
+#include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h"
+#include "support/Cast.h"
+#include "utils/TypePrinter.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::cl_gemm;
+using namespace arm_compute::experimental;
+using namespace arm_compute::utils::cast;
+using namespace arm_compute::opencl::kernels;
+
+namespace
+{
+inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type)
+{
+ return kernel_type != CLGEMMKernelType::NATIVE;
+}
+//Automatically select between mlgo (prioritized) and default heuristics for gemm kernel type
+inline CLGEMMKernelType
+auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run, bool constant_weights)
+{
+ if (!constant_weights)
+ {
+ return CLGEMMKernelType::NATIVE;
+ }
+
+ auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run);
+ if (bool(gemm_kernel))
+ {
+ if (validate_gemm_kernel(gemm_kernel.gemm_type))
+ {
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.",
+ to_string(gemm_kernel.gemm_type).c_str());
+ return gemm_kernel.gemm_type;
+ }
+ }
+ gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run);
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.",
+ to_string(gemm_kernel.gemm_type).c_str());
+ return gemm_kernel.gemm_type;
+}
+// Validate lhs_info and rhs_info for reshaped only rhs kernel
+inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ GEMMKernelInfo gemm_kernel_info)
+{
+ // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel
+ TensorInfo tmp_b_info{};
+ // Validate reshape RHS kernel
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
+ {
+ return false;
+ }
+ // Validate mm kernel
+ gemm_kernel_info.lhs_info = lhs_info;
+ gemm_kernel_info.rhs_info = rhs_info;
+ gemm_kernel_info.has_pad_y = false;
+ if (!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info,
+ rhs_info, gemm_kernel_info)))
+ {
+ return false;
+ }
+ gemm_kernel_info.has_pad_y = true;
+ if (!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info,
+ rhs_info, gemm_kernel_info)))
+ {
+ return false;
+ }
+ return true;
+}
+
+// Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs
+inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query,
+ GEMMKernelInfo kernel_info,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output)
+{
+ auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query);
+ if (config)
+ {
+ if (validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info))
+ {
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+ "Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ",
+ to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+ return {config.lhs_info, config.rhs_info};
+ }
+ }
+ config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+ "Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ",
+ to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+ return {config.lhs_info, config.rhs_info};
+}
+
+// Validate lhs_info and rhs_info for reshaped kernel
+inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ GEMMKernelInfo gemm_kernel_info,
+ bool reinterpret_input_as_3d)
+{
+ // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped kernel
+ TensorInfo tmp_a_info{};
+ TensorInfo tmp_b_info{};
+
+ // Validate reshape LHS kernel
+ auto_init_if_empty(tmp_a_info,
+ a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d)));
+ if (!bool(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d)))
+ {
+ return false;
+ }
+
+ // Validate reshape RHS kernel
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
+ {
+ return false;
+ }
+ // Validate mm kernel
+ gemm_kernel_info.lhs_info = lhs_info;
+ gemm_kernel_info.rhs_info = rhs_info;
+ if (!bool(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, 1.f, 0.f, lhs_info,
+ rhs_info, gemm_kernel_info)))
+ {
+ return false;
+ }
+ return true;
+}
+
+// Automatically select between mlgo (prioritized) and default heuristics for reshaped kernel configs
+inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery query,
+ GEMMKernelInfo kernel_info,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ bool reinterpret_input_as_3d)
+{
+ auto config = auto_heuristics::select_mlgo_gemm_config_reshaped(query);
+ if (config)
+ {
+ if (validate_lhs_rhs_info_reshaped(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info,
+ reinterpret_input_as_3d))
+ {
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+ "Use reshaped config from mlgo heuristics: LHS info: %s ; RHS info: %s ",
+ to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+ return {config.lhs_info, config.rhs_info};
+ }
+ }
+ config = auto_heuristics::select_default_gemm_config_reshaped(query);
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+ "Use reshaped config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(),
+ to_string(config.rhs_info).c_str());
+ return {config.lhs_info, config.rhs_info};
+}
+} // namespace
+
+ClGemm::ClGemm()
+ : _reshape_lhs_kernel(std::make_unique<ClGemmReshapeLhsMatrixKernel>()),
+ _reshape_rhs_kernel(std::make_unique<ClGemmReshapeRhsMatrixKernel>()),
+ _mm_native_kernel(std::make_unique<ClGemmMatrixMultiplyNativeKernel>()),
+ _mm_reshaped_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedKernel>()),
+ _mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedOnlyRhsKernel>()),
+ _mm_reshaped_only_rhs_mmul_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel>()),
+ _tmp_a(),
+ _tmp_b(),
+ _reshape_b_only_on_first_run(false),
+ _gemm_kernel_type(CLGEMMKernelType::NATIVE),
+ _is_prepared(false),
+ _aux_mem(AuxTensorIdx::Count)
+{
+}
+
+void ClGemm::configure_native(const CLCompileContext &compile_context,
+ ITensorInfo *a,
+ ITensorInfo *b,
+ ITensorInfo *c,
+ ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
+{
+ DataType data_type = a->data_type();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ bool broadcast_bias = gemm_info.broadcast_bias();
+
+ GEMMKernelInfo kernel_info;
+ kernel_info.m = m;
+ kernel_info.n = n;
+ kernel_info.k = k;
+ kernel_info.depth_output_gemm3d = depth_output_gemm3d;
+ kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+ kernel_info.broadcast_bias = broadcast_bias;
+ kernel_info.activation_info = gemm_info.activation_info();
+
+ // Set the target for the kernels
+ _mm_native_kernel->set_target(gpu_target);
+
+ auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(
+ auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size});
+
+ // Configure and tune matrix multiply kernel
+ _mm_native_kernel->configure(compile_context, a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info,
+ kernel_info);
+}
+
+void ClGemm::configure_reshaped(const CLCompileContext &compile_context,
+ ITensorInfo *a,
+ ITensorInfo *b,
+ ITensorInfo *c,
+ ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
+{
+ DataType data_type = a->data_type();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ bool broadcast_bias = gemm_info.broadcast_bias();
+
+ GEMMKernelInfo kernel_info;
+ kernel_info.m = m;
+ kernel_info.n = n;
+ kernel_info.k = k;
+ kernel_info.depth_output_gemm3d = depth_output_gemm3d;
+ kernel_info.reinterpret_input_as_3d = false;
+ kernel_info.broadcast_bias = broadcast_bias;
+ kernel_info.activation_info = gemm_info.activation_info();
+
+ // Set the target for the kernels
+ _reshape_lhs_kernel->set_target(gpu_target);
+ _mm_reshaped_kernel->set_target(gpu_target);
+
+ GEMMLHSMatrixInfo lhs_info{};
+ GEMMRHSMatrixInfo rhs_info{};
+
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) =
+ auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size},
+ kernel_info, a, b, c, output, gemm_info.reinterpret_input_as_3d());
+
+ _reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
+ _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);
+
+ // Configure and tune matrix multiply kernel
+ _mm_reshaped_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info,
+ kernel_info);
+
+ // Request memory for LHS and RHS reshape matrix
+ _aux_mem[LhsReshape] = MemoryInfo(offset_int_vec(LhsReshape), MemoryLifetime::Temporary, _tmp_a.total_size());
+ _aux_mem[RhsReshape] = MemoryInfo(
+ offset_int_vec(RhsReshape),
+ _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
+}
+
+void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context,
+ ITensorInfo *a,
+ ITensorInfo *b,
+ ITensorInfo *c,
+ ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
+{
+ DataType data_type = a->data_type();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ bool broadcast_bias = gemm_info.broadcast_bias();
+
+ GEMMKernelInfo kernel_info;
+ kernel_info.m = m;
+ kernel_info.n = n;
+ kernel_info.k = k;
+ kernel_info.depth_output_gemm3d = depth_output_gemm3d;
+ kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+ kernel_info.broadcast_bias = broadcast_bias;
+ kernel_info.activation_info = gemm_info.activation_info();
+
+ // Set the target for the kernels
+ _mm_reshaped_only_rhs_kernel->set_target(gpu_target);
+
+ GEMMLHSMatrixInfo lhs_info{};
+ GEMMRHSMatrixInfo rhs_info{};
+
+ // Pick up the GEMM configuration
+ std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(
+ auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}, kernel_info, a, b, c, output);
+
+ // Transpose matrix
+ _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);
+
+    // Only the has_pad_y = false variant of ClGemmMatrixMultiplyReshapedOnlyRhsKernel is configured here.
+    // At run time we check the cross-plane (y) padding of the lhs and dst tensors and only dispatch the
+    // kernel when neither of them is padded.
+
+ // Configure matrix multiply kernel with no y padding support
+ kernel_info.has_pad_y = false;
+ _mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info,
+ kernel_info);
+
+ // Request memory for RHS reshape matrix
+ _aux_mem[RhsReshape] = MemoryInfo(
+ offset_int_vec(RhsReshape),
+ _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
+}
+
+void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context,
+ ITensorInfo *a,
+ ITensorInfo *b,
+ ITensorInfo *c,
+ ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
+{
+ DataType data_type = a->data_type();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ bool broadcast_bias = gemm_info.broadcast_bias();
+
+ GEMMKernelInfo kernel_info;
+ kernel_info.m = m;
+ kernel_info.n = n;
+ kernel_info.k = k;
+ kernel_info.depth_output_gemm3d = depth_output_gemm3d;
+ kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+ kernel_info.broadcast_bias = broadcast_bias;
+ kernel_info.activation_info = gemm_info.activation_info();
+
+ // Set the target for the kernels
+ _mm_reshaped_only_rhs_mmul_kernel->set_target(gpu_target);
+
+ GEMMLHSMatrixInfo lhs_info{};
+ GEMMRHSMatrixInfo rhs_info{};
+
+ // Pick up the GEMM configuration
+ auto gemm_config = select_default_gemm_config_reshaped_only_rhs(
+ auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size});
+ lhs_info = gemm_config.lhs_info;
+ rhs_info = gemm_config.rhs_info;
+ // Force H0 to 4 in order to use the MMUL extension
+ rhs_info.h0 = 4;
+
+ // Reshape Rhs matrix
+ _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);
+
+ // Configure matrix multiply kernel with no y padding support
+ kernel_info.has_pad_y = false;
+ _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info,
+ rhs_info, kernel_info);
+
+ // Request memory for RHS reshape matrix
+ _aux_mem[RhsReshape] = MemoryInfo(
+ offset_int_vec(RhsReshape),
+ _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
+}
+
+Status ClGemm::validate_native(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_UNUSED(output);
+
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ DataType data_type = a->data_type();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const bool broadcast_bias = gemm_info.broadcast_bias();
+
+ GEMMKernelInfo kernel_info;
+ kernel_info.m = m;
+ kernel_info.n = n;
+ kernel_info.k = k;
+ kernel_info.depth_output_gemm3d = depth_output_gemm3d;
+ kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+ kernel_info.broadcast_bias = broadcast_bias;
+ kernel_info.activation_info = gemm_info.activation_info();
+
+ auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(
+ auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size});
+
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyNativeKernel::validate(
+ a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, kernel_info));
+
+ return Status{};
+}
+
+Status ClGemm::validate_reshaped(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_UNUSED(output);
+
+ TensorInfo tmp_a_info{};
+ TensorInfo tmp_b_info{};
+
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ DataType data_type = a->data_type();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const bool broadcast_bias = gemm_info.broadcast_bias();
+
+ GEMMKernelInfo kernel_info;
+ kernel_info.m = m;
+ kernel_info.n = n;
+ kernel_info.k = k;
+ kernel_info.depth_output_gemm3d = depth_output_gemm3d;
+ kernel_info.reinterpret_input_as_3d = false;
+ kernel_info.broadcast_bias = broadcast_bias;
+ kernel_info.activation_info = gemm_info.activation_info();
+
+ GEMMLHSMatrixInfo lhs_info;
+ GEMMRHSMatrixInfo rhs_info;
+
+ // Pick up the GEMM configuration
+ // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
+ const auto gemm_config =
+ select_default_gemm_config_reshaped(auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size});
+ lhs_info = gemm_config.lhs_info;
+ rhs_info = gemm_config.rhs_info;
+
+ auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(
+ compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
+
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha,
+ beta, lhs_info, rhs_info, kernel_info));
+
+ return Status{};
+}
+
+Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_UNUSED(output);
+
+ TensorInfo tmp_b_info{};
+
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ const DataType data_type = a->data_type();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const bool broadcast_bias = gemm_info.broadcast_bias();
+
+ GEMMKernelInfo kernel_info;
+ kernel_info.m = m;
+ kernel_info.n = n;
+ kernel_info.k = k;
+ kernel_info.depth_output_gemm3d = depth_output_gemm3d;
+ kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+ kernel_info.broadcast_bias = broadcast_bias;
+ kernel_info.activation_info = gemm_info.activation_info();
+
+ GEMMLHSMatrixInfo lhs_info;
+ GEMMRHSMatrixInfo rhs_info;
+
+ // Pick up the GEMM configuration
+ // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
+ const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(
+ auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size});
+ lhs_info = gemm_config.lhs_info;
+ rhs_info = gemm_config.rhs_info;
+
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+
+ // Validate matrix multiply
+ kernel_info.has_pad_y = false;
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(
+ a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
+
+ kernel_info.has_pad_y = true;
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(
+ a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
+
+ return Status{};
+}
+
+Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_UNUSED(alpha);
+ ARM_COMPUTE_UNUSED(output);
+ TensorInfo tmp_b_info{};
+
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ const DataType data_type = a->data_type();
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+ const bool broadcast_bias = gemm_info.broadcast_bias();
+
+ GEMMKernelInfo kernel_info;
+ kernel_info.m = m;
+ kernel_info.n = n;
+ kernel_info.k = k;
+ kernel_info.depth_output_gemm3d = depth_output_gemm3d;
+ kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+ kernel_info.broadcast_bias = broadcast_bias;
+ kernel_info.activation_info = gemm_info.activation_info();
+
+ GEMMLHSMatrixInfo lhs_info;
+ GEMMRHSMatrixInfo rhs_info;
+
+ // Pick up the GEMM configuration
+ // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
+ const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(
+ auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size});
+ lhs_info = gemm_config.lhs_info;
+ rhs_info = gemm_config.rhs_info;
+ // Force H0 to 4 in order to use the MMUL extension
+ rhs_info.h0 = 4;
+
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info));
+
+ // Validate matrix multiply
+ kernel_info.has_pad_y = false;
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(
+ a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
+
+ return Status{};
+}
+
+void ClGemm::configure(const CLCompileContext &compile_context,
+ ITensorInfo *a,
+ ITensorInfo *b,
+ ITensorInfo *c,
+ ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate(a, b, c, output, alpha, beta, gemm_info));
+ ARM_COMPUTE_LOG_PARAMS(a, b, c, output, alpha, beta, gemm_info);
+
+ // Check if we need to reshape the matrix B only on the first run
+ _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+ _is_prepared = gemm_info.retain_internal_weights();
+
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+
+ // Select GEMMType
+ _gemm_kernel_type = auto_select_gemm_kernel(
+ auto_heuristics::CommonQuery{CLScheduler::get().target(), a->data_type(), m, n, k, batch_size},
+ _reshape_b_only_on_first_run, b->are_values_constant());
+
+ const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);
+
+ ITensorInfo *c_to_use = fuse_add_c ? c : nullptr;
+
+ switch (_gemm_kernel_type)
+ {
+ case CLGEMMKernelType::NATIVE:
+ {
+ configure_native(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
+ break;
+ }
+ case CLGEMMKernelType::RESHAPED:
+ {
+ configure_reshaped(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
+ break;
+ }
+ case CLGEMMKernelType::RESHAPED_ONLY_RHS:
+ {
+ configure_reshaped_only_rhs(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
+ break;
+ }
+ case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL:
+ {
+ configure_reshaped_only_rhs_mmul(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info);
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("GEMMType not supported");
+ }
+ }
+}
+
+Status ClGemm::validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info)
+{
+ // Get the GPU target
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+
+ // Check data type early because the auto_select_gemm_kernel has assertions on supported data types
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16);
+
+ // Select GEMMType
+ CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel(
+ auto_heuristics::CommonQuery{
+ CLScheduler::get().target(),
+ a->data_type(),
+ m,
+ n,
+ k,
+ batch_size,
+ },
+ gemm_info.reshape_b_only_on_first_run(), b->are_values_constant());
+
+ const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);
+
+ const ITensorInfo *c_to_use = fuse_add_c ? c : nullptr;
+
+ switch (gemm_kernel_type)
+ {
+ case CLGEMMKernelType::NATIVE:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_native(a, b, c_to_use, output, alpha, beta, gemm_info));
+ break;
+ }
+ case CLGEMMKernelType::RESHAPED:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped(a, b, c_to_use, output, alpha, beta, gemm_info));
+ break;
+ }
+ case CLGEMMKernelType::RESHAPED_ONLY_RHS:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs(a, b, c_to_use, output, alpha, beta, gemm_info));
+ break;
+ }
+ case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL:
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_reshaped_only_rhs_mmul(a, b, c_to_use, output, alpha, beta, gemm_info));
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_RETURN_ERROR_MSG("GEMMType not supported");
+ }
+ }
+
+ return Status{};
+}
+
+void ClGemm::run(ITensorPack &tensors)
+{
+ const ITensor *lhs = tensors.get_const_tensor(ACL_SRC_0);
+ const ITensor *rhs = tensors.get_const_tensor(ACL_SRC_1);
+ ITensor *dst = tensors.get_tensor(ACL_DST);
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, dst);
+
+ CLAuxTensorHandler lhs_reshaped(offset_int_vec(LhsReshape), _tmp_a, tensors, true);
+ CLAuxTensorHandler rhs_reshaped(offset_int_vec(RhsReshape), _tmp_b, tensors, true);
+
+ // Prepare the consts if needed
+ prepare(tensors);
+
+ // Run matrix multiply kernel
+ switch (_gemm_kernel_type)
+ {
+ case CLGEMMKernelType::NATIVE:
+ {
+ CLScheduler::get().enqueue_op(*_mm_native_kernel, tensors, true);
+ break;
+ }
+ case CLGEMMKernelType::RESHAPED:
+ {
+ // Run interleave kernel
+ ITensorPack reshape_lhs_pack{{ACL_SRC, lhs}, {ACL_DST, lhs_reshaped.get()}};
+ CLScheduler::get().enqueue_op(*_reshape_lhs_kernel, reshape_lhs_pack, false);
+
+ if (!_reshape_b_only_on_first_run)
+ {
+ // Run transpose kernel
+ ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}};
+ CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false);
+ }
+ // Copy original tensor pack and overwrite lhs and rhs with reshaped counterparts
+ ITensorPack gemm_reshaped_pack(tensors);
+ gemm_reshaped_pack.add_const_tensor(ACL_SRC_0, lhs_reshaped.get());
+ gemm_reshaped_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get());
+
+ if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED)
+ {
+ CLScheduler::get().enqueue_op(*_mm_reshaped_kernel, gemm_reshaped_pack, true);
+ }
+ break;
+ }
+ case CLGEMMKernelType::RESHAPED_ONLY_RHS:
+ {
+ if (!_reshape_b_only_on_first_run)
+ {
+ // Run transpose kernel
+ ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}};
+ CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false);
+ }
+ // In case of RESHAPED_ONLY_RHS, we need to check the padding requirement
+ // Check if the lhs or dst tensors have padding
+ const unsigned int cross_plane_pad_lhs = lhs->info()->padding().top + lhs->info()->padding().bottom;
+ const unsigned int cross_plane_pad_dst = dst->info()->padding().top + dst->info()->padding().bottom;
+ bool has_pad_y = (cross_plane_pad_lhs != 0) || (cross_plane_pad_dst != 0);
+
+ // Copy original tensor pack and overwrite rhs with reshaped counterpart
+ ITensorPack gemm_reshaped_onlyrhs_pack(tensors);
+ gemm_reshaped_onlyrhs_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get());
+
+ if (has_pad_y)
+ {
+ ARM_COMPUTE_ERROR_ON(has_pad_y);
+ }
+ else
+ {
+ CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_onlyrhs_pack, true);
+ }
+ break;
+ }
+ case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL:
+ {
+ if (!_reshape_b_only_on_first_run)
+ {
+ // Run transpose kernel
+ ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}};
+ CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false);
+ }
+            // In case of RESHAPED_ONLY_RHS_MMUL, we need to check the padding requirement
+ // Check if the lhs or dst tensors have padding
+ const unsigned int cross_plane_pad_lhs = lhs->info()->padding().top + lhs->info()->padding().bottom;
+ const unsigned int cross_plane_pad_dst = dst->info()->padding().top + dst->info()->padding().bottom;
+ bool has_pad_y = (cross_plane_pad_lhs != 0) || (cross_plane_pad_dst != 0);
+
+ // Copy original tensor pack and overwrite rhs with reshaped counterpart
+ ITensorPack gemm_reshaped_onlyrhs_pack(tensors);
+ gemm_reshaped_onlyrhs_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get());
+
+ if (has_pad_y)
+ {
+ ARM_COMPUTE_ERROR_ON(has_pad_y);
+ }
+ else
+ {
+ CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_mmul_kernel, gemm_reshaped_onlyrhs_pack, true);
+ }
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("GEMMType not supported");
+ }
+ }
+}
+
+void ClGemm::prepare(ITensorPack &constants)
+{
+ if (!_is_prepared)
+ {
+ const ITensor *src1 = constants.get_const_tensor(ACL_SRC_1);
+ ICLTensor *rhs_aux =
+ utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(RhsReshape)));
+
+        // If the memory for RHS is persistent and src1 is provided, re-transform it; otherwise assume
+        // that RHS has already been transformed
+        if ((_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent) && (src1 != nullptr) &&
+            (rhs_aux != nullptr))
+ {
+ ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Transforming RHS Matrix!");
+
+ CLAuxTensorHandler rhs_reshaped(_tmp_b, *rhs_aux);
+ ARM_COMPUTE_ERROR_ON(rhs_reshaped.get()->cl_buffer().get() == nullptr);
+
+ ITensorPack reshape_rhs_pack{{ACL_SRC, src1}, {ACL_DST, rhs_reshaped.get()}};
+ CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, true);
+ }
+ _is_prepared = true;
+ }
+}
+
+experimental::MemoryRequirements ClGemm::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace opencl
+} // namespace arm_compute
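
The operator above follows the usual ClOperator flow: validate the tensor metadata, configure on ITensorInfo, then run with an ITensorPack that binds the actual CL tensors. A minimal sketch, assuming F32 inputs, alpha = 1, beta = 0, no bias and a default GEMMInfo; the shapes and names are illustrative, not taken from this patch:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include "src/gpu/cl/operators/ClGemm.h"

using namespace arm_compute;

void gemm_f32_example()
{
    CLScheduler::get().default_init();

    // dst = 1.0f * a * b, with a of size K x M, b of size N x K and dst of size N x M (dim0 x dim1)
    TensorInfo a_info(TensorShape(64U, 32U), 1, DataType::F32);
    TensorInfo b_info(TensorShape(128U, 64U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(128U, 32U), 1, DataType::F32);

    opencl::ClGemm gemm;
    ARM_COMPUTE_ERROR_THROW_ON(opencl::ClGemm::validate(&a_info, &b_info, nullptr, &dst_info, 1.f, 0.f, GEMMInfo()));
    gemm.configure(CLKernelLibrary::get().get_compile_context(), &a_info, &b_info, nullptr, &dst_info, 1.f, 0.f,
                   GEMMInfo());

    // Allocate the backing CL tensors and bind them to the slots read by run()
    CLTensor a, b, dst;
    a.allocator()->init(a_info);
    b.allocator()->init(b_info);
    dst.allocator()->init(dst_info);
    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack pack{{ACL_SRC_0, &a}, {ACL_SRC_1, &b}, {ACL_DST, &dst}};
    // The auxiliary tensors reported by gemm.workspace() are looked up in this pack by slot id
    // before calling gemm.run(pack); a sketch of that step follows ClGemm.h below.
}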
diff --git a/src/gpu/cl/operators/ClGemm.h b/src/gpu/cl/operators/ClGemm.h
new file mode 100644
index 0000000000..85dc1d6c8f
--- /dev/null
+++ b/src/gpu/cl/operators/ClGemm.h
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2016-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_H
+#define ARM_COMPUTE_CL_GEMM_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/function_info/GEMMInfo.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTypes.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/IClOperator.h"
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h"
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h"
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h"
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h"
+#include "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h"
+#include "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to execute GEMM on OpenCL. This function calls the following OpenCL kernels:
+ *
+ * -# @ref kernels::ClGemmReshapeLhsMatrixKernel (only if RESHAPED is selected by the heuristic model)
+ * -# @ref kernels::ClGemmReshapeRhsMatrixKernel (only if either RESHAPED or RESHAPED_ONLY_RHS is selected by the select_gemm_kernel method())
+ * -# @ref kernels::ClGemmMatrixMultiplyNativeKernel (only if NATIVE is selected by the select_gemm_kernel method())
+ * -# @ref kernels::ClGemmMatrixMultiplyReshapedKernel (only if RESHAPED is selected by the select_gemm_kernel method())
+ * -# @ref kernels::ClGemmMatrixMultiplyReshapedOnlyRhsKernel (only if RESHAPED_ONLY_RHS is selected by the select_gemm_kernel method())
+ * -# @ref kernels::ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel (only if RESHAPED_ONLY_RHS_MMUL is selected by the select_gemm_kernel method())
+ */
+class ClGemm : public IClOperator
+{
+public:
+ /** Constructor */
+ ClGemm();
+ /** Initialise the kernel's inputs and output
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:------------|:-----------|:---------|:--------------|
+ * |F32 |F32 |F32 |F32 |
+ * |F16 |F16 |F16 |F16 |
+ *
+ * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
+ *
+ * @note All tensors must have the same data type.
+ *
+ * @note Whilst the first input tensor can be a vector, the second input tensor must be at least a matrix
+ *
+ * @note Batched GEMM only allows RHS tensor's rank to be <= 3
+ * @note Batched GEMM only supports broadcasting cases where RHS rank < LHS rank but not the other way around
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] a First input tensor (Matrix or Vector A). Data types supported: F16/F32
+ * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a.
+ * @param[in] c Third input tensor (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a.
+ * @param[out] output Output tensor. Data type supported: same as @p a
+ * @param[in] alpha Weight of the matrix product
+ * @param[in] beta Weight of matrix C
+ * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
+ * if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping
+ * in case matrix A and matrix B have been already transformed.
+ */
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *a,
+ ITensorInfo *b,
+ ITensorInfo *c,
+ ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClGemm::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ void configure_native(const CLCompileContext &compile_context,
+ ITensorInfo *a,
+ ITensorInfo *b,
+ ITensorInfo *c,
+ ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info);
+ void configure_reshaped(const CLCompileContext &compile_context,
+ ITensorInfo *a,
+ ITensorInfo *b,
+ ITensorInfo *c,
+ ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info);
+ void configure_reshaped_only_rhs(const CLCompileContext &compile_context,
+ ITensorInfo *a,
+ ITensorInfo *b,
+ ITensorInfo *c,
+ ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info);
+ void configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context,
+ ITensorInfo *a,
+ ITensorInfo *b,
+ ITensorInfo *c,
+ ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info);
+
+ static Status validate_native(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info);
+ static Status validate_reshaped(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info);
+ static Status validate_reshaped_only_rhs(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info);
+ static Status validate_reshaped_only_rhs_mmul(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ float alpha,
+ float beta,
+ const GEMMInfo &gemm_info);
+
+private:
+ enum AuxTensorIdx
+ {
+ LhsReshape = 0,
+ RhsReshape,
+ Count
+ };
+
+private:
+ std::unique_ptr<kernels::ClGemmReshapeLhsMatrixKernel> _reshape_lhs_kernel;
+ std::unique_ptr<kernels::ClGemmReshapeRhsMatrixKernel> _reshape_rhs_kernel;
+ std::unique_ptr<kernels::ClGemmMatrixMultiplyNativeKernel> _mm_native_kernel;
+ std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedKernel> _mm_reshaped_kernel;
+ std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedOnlyRhsKernel> _mm_reshaped_only_rhs_kernel;
+ std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel> _mm_reshaped_only_rhs_mmul_kernel;
+ TensorInfo _tmp_a;
+ TensorInfo _tmp_b;
+ bool _reshape_b_only_on_first_run;
+ CLGEMMKernelType _gemm_kernel_type;
+ bool _is_prepared;
+ experimental::MemoryRequirements _aux_mem{};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_H */
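
The MemoryRequirements returned by workspace() describe the auxiliary buffers (LhsReshape / RhsReshape above) that run() and prepare() look up by slot id. Below is a minimal, hand-rolled stand-in for what helpers such as manage_workspace() in src/core/helpers/MemoryHelpers.h provide, assuming the experimental::MemoryInfo fields slot, size and alignment; the allocation policy is illustrative only:

#include <memory>
#include <vector>

#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"

using namespace arm_compute;

// Allocate one CLTensor per requirement and bind it to its slot in the pack
std::vector<std::unique_ptr<CLTensor>> allocate_workspace(const experimental::MemoryRequirements &reqs,
                                                          ITensorPack                            &pack)
{
    std::vector<std::unique_ptr<CLTensor>> workspace;
    for (const auto &req : reqs)
    {
        if (req.size == 0)
        {
            continue;
        }
        auto aux = std::make_unique<CLTensor>();
        // Auxiliary buffers are plain byte blobs sized (and aligned) as requested by the operator
        aux->allocator()->init(TensorInfo(TensorShape(req.size + req.alignment), 1, DataType::U8), req.alignment);
        aux->allocator()->allocate();
        pack.add_tensor(req.slot, aux.get());
        workspace.push_back(std::move(aux));
    }
    return workspace;
}

// Usage, continuing the sketch above:
//   auto workspace = allocate_workspace(gemm.workspace(), pack);
//   gemm.run(pack); // prepare() is invoked on the first run; with reshape_b_only_on_first_run
//                   // the persistent RhsReshape buffer keeps the transformed B for later runs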
diff --git a/src/gpu/cl/operators/ClGemmConv2d.cpp b/src/gpu/cl/operators/ClGemmConv2d.cpp
new file mode 100644
index 0000000000..55d815a1ef
--- /dev/null
+++ b/src/gpu/cl/operators/ClGemmConv2d.cpp
@@ -0,0 +1,668 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClGemmConv2d.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/gpu/cl/kernels/ClActivationKernel.h"
+#include "src/gpu/cl/kernels/ClCol2ImKernel.h"
+#include "src/gpu/cl/kernels/ClIm2ColKernel.h"
+#include "src/gpu/cl/kernels/ClWeightsReshapeKernel.h"
+#include "src/gpu/cl/operators/ClGemm.h"
+#include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"
+#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
+#include "support/Cast.h"
+
+namespace arm_compute
+{
+using namespace experimental;
+using namespace misc::shape_calculator;
+using namespace utils::cast;
+namespace opencl
+{
+ClGemmConv2d::ClGemmConv2d()
+ : _weights_reshape_kernel(nullptr),
+ _im2col_kernel(nullptr),
+ _mm_gemm(nullptr),
+ _mm_gemmlowp(nullptr),
+ _col2im_kernel(nullptr),
+ _activation_kernel(nullptr),
+ _im2col_output(),
+ _weights_reshaped(),
+ _gemm_output(),
+ _skip_im2col(false),
+ _skip_col2im(false),
+ _is_quantized(false),
+ _fuse_activation(true),
+ _append_bias(false),
+ _is_prepared(false),
+ _aux_mem(AuxTensorIdx::Count)
+{
+}
+ClGemmConv2d::~ClGemmConv2d() = default;
+
+void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
+ int gemm_3d_depth,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_mm(src, weights, biases, dst, gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info));
+
+ const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
+ false, // is_b_reshaped
+ true, // reshape_b_only_on_first_run
+ gemm_3d_depth, // depth_output_gemm3d
+ _skip_im2col, // reinterpret_input_as_3d
+ false, // retain_internal_weights
+ gemmlowp_output_stage, // gemmlowp_output_stage
+ false, // fast_math
+ false, // fp_mixed_precision
+ true, // broadcast_bias
+ act_info // activation_info
+ );
+
+ TensorInfo tmp_src{*src};
+ if (_is_quantized)
+ {
+        // Since we need negative offsets for computing convolution, adjust the QuantizationInfo():
+        // extract and negate the input and weights offsets
+ const QuantizationInfo input_quantization_info = src->quantization_info();
+ const QuantizationInfo weights_quantization_info = weights->quantization_info();
+
+ tmp_src.set_quantization_info(
+ QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
+ weights->set_quantization_info(
+ QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
+
+ _mm_gemmlowp = std::make_unique<ClGemmLowpMatrixMultiplyCore>();
+ _mm_gemmlowp->configure(compile_context, &tmp_src, weights, biases, dst, gemm_info);
+
+        // Restore the original QuantizationInfo as the weights could be used in other convolution layers
+ weights->set_quantization_info(weights_quantization_info);
+
+ auto mm_mem_req = _mm_gemmlowp->workspace();
+ for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
+ {
+ _aux_mem[cont] = mm_mem_req[cont];
+ }
+ }
+ else
+ {
+ // Configure matrix multiply function
+ _mm_gemm = std::make_unique<ClGemm>();
+ _mm_gemm->configure(compile_context, &tmp_src, weights, biases, dst, 1.0f, 1.0f, gemm_info);
+ auto mm_mem_req = _mm_gemm->workspace();
+ for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
+ {
+ _aux_mem[cont] = mm_mem_req[cont];
+ }
+ }
+}
+
+Status ClGemmConv2d::validate_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
+ int gemm_3d_depth,
+ bool skip_im2col,
+ const ActivationLayerInfo &act_info)
+{
+ const bool is_quantized = is_data_type_quantized_asymmetric(src->data_type());
+
+ const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped
+ false, // is_b_reshaped
+ true, // reshape_b_only_on_first_run
+ gemm_3d_depth, // depth_output_gemm3d
+ skip_im2col, // reinterpret_input_as_3d
+ false, // retain_internal_weights
+ gemmlowp_output_stage, // gemmlowp_output_stage
+ false, // fast_math
+ false, // fp_mixed_precision
+ true, // broadcast_bias
+ act_info // activation_info
+ );
+
+ if (is_quantized)
+ {
+        // Since we need negative offsets for computing convolution, adjust the QuantizationInfo():
+        // extract and negate the input and weights offsets
+ const QuantizationInfo input_quantization_info = src->quantization_info();
+ const QuantizationInfo weights_quantization_info = weights->quantization_info();
+
+ std::unique_ptr<ITensorInfo> src_qa = src->clone();
+ std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
+ src_qa->set_quantization_info(
+ QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
+ weights_qa->set_quantization_info(
+ QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
+
+ // Perform validation step on GEMMLowp
+ return ClGemmLowpMatrixMultiplyCore::validate(src_qa.get(), weights_qa.get(), biases, dst, gemm_info);
+ }
+ else
+ {
+ // Perform validation step on Matrix multiply function
+ return ClGemm::validate(src, weights, biases, dst, 1.0f, 1.0f, gemm_info);
+ }
+}
+
+void ClGemmConv2d::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ const Conv2dInfo &conv2d_info,
+ const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+
+ ARM_COMPUTE_ERROR_THROW_ON(ClGemmConv2d::validate(src, weights, biases, dst, conv2d_info, weights_info));
+ ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv2d_info, weights_info);
+
+ const DataType data_type = src->data_type();
+ const DataLayout data_layout = src->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+ const unsigned int kernel_width = weights->dimension(idx_width);
+ const unsigned int kernel_height = weights->dimension(idx_height);
+ const unsigned int num_kernels = weights->dimension(idx_kernels);
+
+ const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
+
+ _is_prepared = weights_info.retain_internal_weights();
+ _is_quantized = is_data_type_quantized_asymmetric(src->data_type());
+ _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 &&
+ conv2d_info.conv_info.stride().first == 1 && conv2d_info.conv_info.stride().second == 1);
+ _skip_col2im = data_layout == DataLayout::NHWC;
+
+    // Only for quantized types there are a few cases where we cannot fuse the activation function in GEMM
+ _fuse_activation = true;
+
+ const ITensorInfo *gemm_input_to_use = src;
+ ITensorInfo *gemm_output_to_use = dst;
+
+ // Get parameters from conv_info
+ unsigned int stride_x = 0;
+ unsigned int stride_y = 0;
+ std::tie(stride_x, stride_y) = conv2d_info.conv_info.stride();
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+ std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+ kernel_height, conv2d_info.conv_info, conv2d_info.dilation);
+
+ unsigned int mat_weights_cols = num_kernels / conv2d_info.num_groups;
+
+ ITensorInfo *biases_to_use = biases;
+ _append_bias = false;
+
+ _weights_reshape_kernel = std::make_unique<kernels::ClWeightsReshapeKernel>();
+ if (conv2d_info.num_groups != 1 && biases != nullptr)
+ {
+        // num_groups != 1 is only supported for NCHW
+        // Since a utility function to reshape the biases is missing, we append the biases to the weights tensor
+ biases_to_use = nullptr;
+ _append_bias = true;
+ _weights_reshape_kernel->configure(compile_context, weights, biases, &_weights_reshaped,
+ conv2d_info.num_groups);
+ }
+ else
+ {
+ _weights_reshape_kernel->configure(compile_context, weights, nullptr, &_weights_reshaped,
+ conv2d_info.num_groups);
+ }
+
+ // Create tensor to store im2col reshaped inputs
+ if (!_skip_im2col)
+ {
+ // Configure and tune im2col. im2col output shape is auto-initialized
+ _im2col_kernel = std::make_unique<opencl::kernels::ClIm2ColKernel>();
+
+ // Set the GPU target for im2col
+ _im2col_kernel->set_target(CLScheduler::get().target());
+ _im2col_kernel->configure(compile_context, src, &_im2col_output, Size2D(kernel_width, kernel_height),
+ conv2d_info.conv_info, _append_bias, conv2d_info.dilation, conv2d_info.num_groups);
+
+ // Set quantization info
+ _im2col_output.set_quantization_info(src->quantization_info());
+ CLScheduler::get().tune_kernel_static(*_im2col_kernel);
+
+ // Update GEMM input
+ gemm_input_to_use = &_im2col_output;
+ }
+
+ // Create GEMM output tensor
+ if (!_skip_col2im)
+ {
+ TensorShape shape_gemm;
+
+ // If we cannot skip col2im it means we run im2col as well
+ shape_gemm = _im2col_output.tensor_shape();
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, conv_w * conv_h);
+
+ _gemm_output = TensorInfo(shape_gemm, 1, data_type);
+ _gemm_output.set_quantization_info(dst->quantization_info()).set_data_layout(src->data_layout());
+
+ // Update GEMM output
+ gemm_output_to_use = &_gemm_output;
+ }
+
+ GEMMLowpOutputStageInfo gemmlowp_output_stage;
+ gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ gemmlowp_output_stage.gemmlowp_offset = 0;
+
+ // Configure output stage for quantized case
+ if (_is_quantized)
+ {
+ const auto output_quant_info = (dst->total_size() == 0) ? iq_info : oq_info;
+ const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type());
+ const unsigned int num_filters = (is_quantized_per_channel) ? num_kernels : 1;
+
+ gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel;
+
+ gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters);
+ gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters);
+ quantization::compute_quantized_multipliers_and_shifts(src, weights, dst,
+ gemmlowp_output_stage.gemmlowp_multipliers.data(),
+ gemmlowp_output_stage.gemmlowp_shifts.data());
+ gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0];
+ gemmlowp_output_stage.gemmlowp_shift = gemmlowp_output_stage.gemmlowp_shifts[0];
+
+ PixelValue min_val{};
+ PixelValue max_val{};
+ std::tie(min_val, max_val) = get_min_max(dst->data_type());
+
+ auto min_activation = min_val.get<int32_t>();
+ auto max_activation = max_val.get<int32_t>();
+
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = {
+ ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
+
+ if (conv2d_info.act_info.enabled())
+ {
+ if (supported_acts.count(conv2d_info.act_info.activation()) != 0)
+ {
+ std::tie(min_activation, max_activation) =
+ get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info);
+ }
+ else
+ {
+ _fuse_activation = false;
+ }
+ }
+
+ // Set the GEMMLowp output stage info
+ gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset;
+ gemmlowp_output_stage.gemmlowp_min_bound = min_activation;
+ gemmlowp_output_stage.gemmlowp_max_bound = max_activation;
+ }
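+
+    // Illustration of the output stage set up above (QUANTIZE_DOWN_FIXEDPOINT): the real requantization
+    // multiplier M = (input_scale * weights_scale) / output_scale is decomposed by
+    // compute_quantized_multipliers_and_shifts() into a Q0.31 fixed-point multiplier and a shift
+    // (one pair per filter when quantizing per channel), so that, roughly,
+    //   dst_q = clamp(((acc * gemmlowp_multiplier) >> gemmlowp_shift) + gemmlowp_offset,
+    //                 gemmlowp_min_bound, gemmlowp_max_bound)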
+
+ // Configure and tune GEMM
+ // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
+ const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
+
+ configure_mm(compile_context, gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use,
+ gemmlowp_output_stage, gemm_3d_depth, conv2d_info.act_info);
+
+ if (!_skip_col2im)
+ {
+ // Set the GPU target for col2im
+ _col2im_kernel = std::make_unique<opencl::kernels::ClCol2ImKernel>();
+ _col2im_kernel->set_target(CLScheduler::get().target());
+ // Configure and tune Col2Im
+ _col2im_kernel->configure(compile_context, gemm_output_to_use, dst, Size2D(conv_w, conv_h),
+ conv2d_info.num_groups);
+ CLScheduler::get().tune_kernel_static(*_col2im_kernel.get());
+ }
+
+ ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h),
+ "Output shape does not match the expected one");
+
+ if (!_fuse_activation)
+ {
+ _activation_kernel = std::make_unique<opencl::kernels::ClActivationKernel>();
+ _activation_kernel->configure(compile_context, dst, nullptr, conv2d_info.act_info);
+ }
+
+ _aux_mem[Im2ColOutput] =
+ MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size());
+ _aux_mem[WeightsReshaped] =
+ MemoryInfo(offset_int_vec(WeightsReshaped), MemoryLifetime::Persistent, _weights_reshaped.total_size());
+ _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size());
+}
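+
+// Illustration of the lowering configured above, assuming NCHW, num_groups = 1, a 56x56x64 input,
+// 128 3x3 kernels, stride 1 and pad 1 (sizes given as dim0 x dim1):
+//   conv_w = conv_h = 56, M = conv_w * conv_h = 3136, K = 3 * 3 * 64 = 576, N = num_kernels = 128
+//   im2col output    : 576 x 3136 (gemm_input_to_use)
+//   reshaped weights : 128 x 576  (_weights_reshaped)
+//   GEMM output      : 128 x 3136 (gemm_output_to_use)
+//   col2im           : folds the GEMM output back to 56 x 56 x 128 (dst)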
+
+Status ClGemmConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const Conv2dInfo &conv2d_info,
+ const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+ const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type());
+
+ if (!is_quantized_per_channel)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW),
+ "Grouping (num_groups != 1) with NHWC data layout is not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_type() == DataType::QASYMM8),
+ "Grouping (num_groups != 1) is not supported with QASYMM8");
+ ARM_COMPUTE_RETURN_ERROR_ON(((src->dimension(2) / weights->dimension(2)) != conv2d_info.num_groups) &&
+ (src->data_layout() == DataLayout::NCHW));
+
+ const DataLayout data_layout = src->data_layout();
+ const DataType data_type = src->data_type();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+ const unsigned int kernel_width = weights->dimension(idx_width);
+ const unsigned int kernel_height = weights->dimension(idx_height);
+ const unsigned int num_kernels = weights->dimension(idx_kernels);
+
+ TensorInfo im2col_reshaped_info{};
+ TensorInfo info_gemm{};
+ TensorInfo weights_reshaped_info{};
+ const ITensorInfo *gemm_input_to_use = src;
+ const ITensorInfo *gemm_output_to_use = dst;
+ const ITensorInfo *weights_to_use = weights;
+ const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
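+ // Note: as the checks below suggest, a 1x1 stride-1 convolution in NHWC is already a plain
+ // GEMM over the channel dimension, so the im2col transform can be skipped; for NHWC the GEMM
+ // is run in 3D mode (gemm_3d_depth != 0 further down), so the col2im reshape of the output is
+ // not needed either.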
+ const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 &&
+ conv2d_info.conv_info.stride().first == 1 && conv2d_info.conv_info.stride().second == 1);
+ const bool skip_col2im = data_layout == DataLayout::NHWC;
+ bool fuse_activation = true;
+
+ ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * conv2d_info.num_groups) !=
+ src->dimension(idx_channel));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+
+ // Validate biases
+ if (biases != nullptr)
+ {
+ if (is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels));
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ }
+
+ if (conv2d_info.act_info.enabled())
+ {
+ ARM_COMPUTE_ERROR_ON(conv2d_info.act_info.b() > conv2d_info.act_info.a());
+ }
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+
+ std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+ kernel_height, conv2d_info.conv_info, conv2d_info.dilation);
+
+ unsigned int mat_weights_cols = num_kernels / conv2d_info.num_groups;
+
+ const ITensorInfo *biases_to_use = biases;
+ bool append_bias = false;
+
+ if (conv2d_info.num_groups != 1 && biases != nullptr)
+ {
+ // num_groups != 1 can only be for NCHW
+ // Since a utility function to reshape the biases is not available, we append the biases to the weights tensor
+ biases_to_use = nullptr;
+ append_bias = true;
+ weights_reshaped_info =
+ TensorInfo(compute_weights_reshaped_shape(*weights, true, conv2d_info.num_groups), 1, data_type);
+ }
+ else
+ {
+ weights_reshaped_info =
+ TensorInfo(compute_weights_reshaped_shape(*weights, false, conv2d_info.num_groups), 1, data_type);
+ }
+
+ weights_to_use = &weights_reshaped_info;
+
+ if (!skip_im2col)
+ {
+ const Size2D kernel_dims(kernel_width, kernel_height);
+
+ // Output tensor auto initialization if not yet initialized
+ TensorShape expected_output_shape =
+ compute_im2col_conv_shape(src, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation,
+ conv2d_info.num_groups == 1, conv2d_info.num_groups);
+
+ auto_init_if_empty(im2col_reshaped_info, src->clone()->set_tensor_shape(expected_output_shape));
+
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ opencl::kernels::ClIm2ColKernel::validate(src, &im2col_reshaped_info, kernel_dims, conv2d_info.conv_info,
+ append_bias, conv2d_info.dilation, conv2d_info.num_groups));
+ gemm_input_to_use = &im2col_reshaped_info;
+ }
+
+ // Create GEMM output tensor
+ if (!skip_col2im)
+ {
+ TensorShape shape_gemm;
+
+ shape_gemm = gemm_input_to_use->tensor_shape();
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, conv_w * conv_h);
+
+ info_gemm = TensorInfo(shape_gemm, 1, data_type);
+ info_gemm.set_quantization_info(dst->quantization_info()).set_data_layout(src->data_layout());
+ gemm_output_to_use = &info_gemm;
+ }
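+ // Illustrative example (values assumed for clarity, not taken from the library): a 3x3
+ // convolution with 64 output kernels, num_groups = 1 and a 32x32 convolved output yields
+ // shape_gemm = [ mat_weights_cols, conv_w * conv_h ] = [ 64, 1024 ] for the tensor above.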
+
+ GEMMLowpOutputStageInfo gemmlowp_output_stage;
+ gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ gemmlowp_output_stage.gemmlowp_offset = 0;
+ gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel;
+
+ if (is_quantized)
+ {
+ const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
+ const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
+ const auto output_quant_info = (dst->total_size() == 0) ? iq_info : oq_info;
+ const unsigned int num_filters = (is_quantized_per_channel) ? num_kernels : 1;
+
+ gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters);
+ gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters);
+ quantization::compute_quantized_multipliers_and_shifts(src, weights, dst,
+ gemmlowp_output_stage.gemmlowp_multipliers.data(),
+ gemmlowp_output_stage.gemmlowp_shifts.data());
+ gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0];
+ gemmlowp_output_stage.gemmlowp_shift = gemmlowp_output_stage.gemmlowp_shifts[0];
+
+ int min_activation = 0;
+ int max_activation = 0;
+
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = {
+ ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
+
+ if (conv2d_info.act_info.enabled())
+ {
+ if (supported_acts.count(conv2d_info.act_info.activation()) != 0)
+ {
+ std::tie(min_activation, max_activation) =
+ get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info);
+ }
+ else
+ {
+ fuse_activation = false;
+ }
+ }
+
+ // Set the GEMMLowp output stage info
+ gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset;
+ gemmlowp_output_stage.gemmlowp_min_bound = min_activation;
+ gemmlowp_output_stage.gemmlowp_max_bound = max_activation;
+ }
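+ // Worked example (assumed numbers, for illustration only): with a QASYMM8 destination of
+ // scale 0.05 and offset 10, fusing LU_BOUNDED_RELU(a = 6, b = 0) would give
+ // gemmlowp_min_bound = quantize(0) = 10 and gemmlowp_max_bound = quantize(6) = 130, so the
+ // requantized output is clamped to the activation range inside the GEMM itself.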
+
+ // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
+ const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
+
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use,
+ gemmlowp_output_stage, gemm_3d_depth, skip_im2col, conv2d_info.act_info));
+
+ // Validate Col2Im
+ if (!skip_col2im)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::ClCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups));
+ }
+
+ // Validate Activation Layer
+ if (!fuse_activation)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, nullptr, conv2d_info.act_info));
+ }
+
+ return Status{};
+}
+
+void ClGemmConv2d::run(ITensorPack &tensors)
+{
+ prepare(tensors);
+
+ auto src = tensors.get_const_tensor(ACL_SRC_0);
+ auto biases = tensors.get_const_tensor(ACL_SRC_2);
+ auto dst = tensors.get_tensor(ACL_DST);
+ auto gemm_input_to_use = src;
+ auto gemm_output_to_use = dst;
+
+ CLAuxTensorHandler im2col_output(offset_int_vec(Im2ColOutput), _im2col_output, tensors, false);
+ CLAuxTensorHandler gemm_output(offset_int_vec(GemmOutput), _gemm_output, tensors, false);
+ CLAuxTensorHandler weights_reshaped(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, false);
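+ // The handlers above bind this operator's internal tensor infos (im2col output, reshaped
+ // weights, GEMM output) to the auxiliary memory supplied by the caller in the pack, keyed by
+ // the offset_int_vec(...) ids advertised through workspace().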
+
+ // Run im2col
+ if (!_skip_im2col)
+ {
+ ITensorPack pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, im2col_output.get()}};
+ CLScheduler::get().enqueue_op(*_im2col_kernel, pack, false);
+ gemm_input_to_use = im2col_output.get();
+ }
+ if (!_skip_col2im)
+ {
+ gemm_output_to_use = gemm_output.get();
+ }
+ ITensorPack pack_mm = tensors;
+ pack_mm.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use);
+ pack_mm.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get());
+ if (!_append_bias)
+ {
+ pack_mm.add_const_tensor(TensorType::ACL_SRC_2, biases);
+ }
+ pack_mm.add_tensor(TensorType::ACL_DST, gemm_output_to_use);
+ // Run either ClGemm or ClGemmLowpMatrixMultiplyCore
+ if (_is_quantized)
+ {
+ // Run gemmlowp
+ _mm_gemmlowp->run(pack_mm);
+ }
+ else
+ {
+ // Run gemm
+ _mm_gemm->run(pack_mm);
+ }
+
+ // Reshape output matrix
+ if (!_skip_col2im)
+ {
+ ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}};
+ CLScheduler::get().enqueue_op(*_col2im_kernel.get(), pack, false);
+ }
+
+ // Run the activation layer if it could not be fused in the GEMM
+ if (!_fuse_activation)
+ {
+ ITensorPack pack = {{TensorType::ACL_SRC, dst}, {TensorType::ACL_DST, dst}};
+ CLScheduler::get().enqueue_op(*_activation_kernel.get(), pack, false);
+ }
+}
+
+void ClGemmConv2d::prepare(ITensorPack &tensors)
+{
+ if (!_is_prepared)
+ {
+ // Run weights reshaping and mark original weights tensor as unused
+ ICLTensor *weights_reshaped_p =
+ utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(WeightsReshaped)));
+ CLAuxTensorHandler weights_reshaped(_weights_reshaped, *weights_reshaped_p);
+ auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ ITensorPack pack = {{TensorType::ACL_SRC, weights}, {TensorType::ACL_DST, weights_reshaped.get()}};
+
+ if (_append_bias)
+ {
+ const auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ pack.add_const_tensor(TensorType::ACL_BIAS, biases);
+ }
+ CLScheduler::get().enqueue_op(*_weights_reshape_kernel.get(), pack, true);
+ tensors.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get());
+
+ // Prepare GEMM
+ _is_quantized ? _mm_gemmlowp->prepare(tensors) : _mm_gemm->prepare(tensors);
+ _is_prepared = true;
+ }
+}
+experimental::MemoryRequirements ClGemmConv2d::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClGemmConv2d.h b/src/gpu/cl/operators/ClGemmConv2d.h
new file mode 100644
index 0000000000..e8f3147ac3
--- /dev/null
+++ b/src/gpu/cl/operators/ClGemmConv2d.h
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_OPERATORS_CLGEMMCONV2D_H
+#define ACL_SRC_GPU_CL_OPERATORS_CLGEMMCONV2D_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+class ClGemm;
+class ClGemmLowpMatrixMultiplyCore;
+namespace kernels
+{
+class ClIm2ColKernel;
+class ClCol2ImKernel;
+class ClWeightsReshapeKernel;
+class ClActivationKernel;
+} // namespace kernels
+
+/** Basic function to compute the convolution layer. This function calls the following OpenCL kernels/functions:
+ *
+ * -# @ref opencl::kernels::ClIm2ColKernel
+ * -# @ref ClGemm (if the data type is FP32 or FP16)
+ * -# @ref ClGemmLowpMatrixMultiplyCore (if the data type is QASYMM8/QASYMM8_SIGNED)
+ * -# @ref ClGemmLowpOutputStage with QUANTIZE_DOWN_FIXEDPOINT type of quantization (if the data type is QASYMM8/QASYMM8_SIGNED)
+ * -# @ref opencl::kernels::ClCol2ImKernel (if NCHW data layout)
+ * -# @ref opencl::kernels::ClActivationKernel
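+ *
+ * A minimal usage sketch (illustrative only: the variable names, tensor setup and the way the
+ * auxiliary workspace tensors are supplied are assumptions, not taken from the library docs):
+ * @code
+ * ClGemmConv2d conv;
+ * conv.configure(compile_context, &src_info, &weights_info, &biases_info, &dst_info, conv2d_info);
+ * // At run time, pack the actual tensors (plus any auxiliary tensors advertised by workspace())
+ * ITensorPack pack = {{TensorType::ACL_SRC_0, &src}, {TensorType::ACL_SRC_1, &weights},
+ *                     {TensorType::ACL_SRC_2, &biases}, {TensorType::ACL_DST, &dst}};
+ * conv.run(pack);
+ * @endcode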
+ */
+class ClGemmConv2d : public IClOperator
+{
+public:
+ /** Constructor */
+ ClGemmConv2d();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ ClGemmConv2d(const ClGemmConv2d &) = delete;
+ /** Default move constructor */
+ ClGemmConv2d(ClGemmConv2d &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ ClGemmConv2d &operator=(const ClGemmConv2d &) = delete;
+ /** Default move assignment operator */
+ ClGemmConv2d &operator=(ClGemmConv2d &&) = default;
+ /** Default destructor */
+ ~ClGemmConv2d();
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:--------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8 or QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8_SIGNED.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type, except for input of quantized type where biases should be of S32 type.
+ * @param[out] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] conv2d_info Contains convolution 2d info described in @ref Conv2dInfo.
+ * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. If this is not part of the fully connected layer the weights
+ * tensor has also been transposed with CLGEMMReshapeRHSMatrixKernel. Data type supported: Same as @p input.
+ */
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ const Conv2dInfo &conv2d_info,
+ const WeightsInfo &weights_info = WeightsInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClGemmConv2d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const Conv2dInfo &conv2d_info,
+ const WeightsInfo &weights_info = WeightsInfo());
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ /** Configures the appropriate matrix multiply routine
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] weights Weights tensor info. Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8 or
+ * QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8_SIGNED.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type, except for input of quantized type where biases should be of S32 type.
+ * @param[in, out] dst Output tensor info. Data types supported: same as @p input.
+ * @param[in] gemmlowp_output_stage GEMMLowp output stage info
+ * @param[in] gemm_3d_depth Depth of GEMM 3D
+ * @param[in] act_info Activation to apply after the matrix multiplication
+ */
+ void configure_mm(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
+ int gemm_3d_depth,
+ const ActivationLayerInfo &act_info);
+ /** Static function to check if given info will lead to a valid configuration of the ClGemmConv2d matrix multiply routines
+ *
+ * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] weights Weights tensor info. Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8 or
+ * QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8_SIGNED.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type, except for input of quantized type where biases should be of S32 type.
+ * @param[in] dst Output tensor info. Data types supported: same as @p input.
+ * @param[in] gemmlowp_output_stage GEMMLowp output stage info
+ * @param[in] gemm_3d_depth Depth of GEMM 3D
+ * @param[in] skip_im2col Flag which specifies if im2col has to be skipped (i.e. a 1x1 convolution with NHWC data layout).
+ * @param[in] act_info Activation to apply after the matrix multiplication
+ *
+ * @return a status
+ */
+ static Status validate_mm(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
+ int gemm_3d_depth,
+ bool skip_im2col,
+ const ActivationLayerInfo &act_info);
+
+ enum AuxTensorIdx
+ {
+ // ClGemmLowpMatrixMultiplyCore has up to 7 internal tensors
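+ // (starting these indices at 8 presumably keeps them from colliding with the nested GEMM's
+ // own workspace slots, which are exposed through the same offset_int_vec id space)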
+ Im2ColOutput = 8,
+ WeightsReshaped,
+ GemmOutput,
+ Count
+ };
+
+ std::unique_ptr<kernels::ClWeightsReshapeKernel> _weights_reshape_kernel;
+ std::unique_ptr<kernels::ClIm2ColKernel> _im2col_kernel;
+ std::unique_ptr<ClGemm> _mm_gemm;
+ std::unique_ptr<ClGemmLowpMatrixMultiplyCore> _mm_gemmlowp;
+ std::unique_ptr<opencl::kernels::ClCol2ImKernel> _col2im_kernel;
+ std::unique_ptr<kernels::ClActivationKernel> _activation_kernel;
+
+ TensorInfo _im2col_output;
+ TensorInfo _weights_reshaped;
+ TensorInfo _gemm_output;
+
+ bool _skip_im2col;
+ bool _skip_col2im;
+ bool _is_quantized;
+ bool _fuse_activation;
+ bool _append_bias;
+ bool _is_prepared;
+
+ experimental::MemoryRequirements _aux_mem;
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_OPERATORS_CLGEMMCONV2D_H
diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
new file mode 100644
index 0000000000..71c247de79
--- /dev/null
+++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
@@ -0,0 +1,950 @@
+/*
+ * Copyright (c) 2017-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"
+
+#include "arm_compute/core/Log.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/gpu/cl/kernels/ClCastKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h"
+#include "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
+#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
+#include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::cl_gemm;
+using namespace arm_compute::opencl::kernels;
+using namespace arm_compute::experimental;
+
+namespace
+{
+inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type)
+{
+ switch (kernel_type)
+ {
+ case CLGEMMKernelType::NATIVE:
+ case CLGEMMKernelType::RESHAPED_ONLY_RHS:
+ case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL:
+ {
+ return true;
+ }
+ default:
+ {
+ return false;
+ }
+ }
+}
+
+// Automatically select between mlgo (prioritized) and default heuristics for gemm kernel type
+inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run)
+{
+ auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run);
+ if (bool(gemm_kernel))
+ {
+ if (validate_gemm_kernel(gemm_kernel.gemm_type))
+ {
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.",
+ to_string(gemm_kernel.gemm_type).c_str());
+ return gemm_kernel.gemm_type;
+ }
+ }
+ gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run);
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.",
+ to_string(gemm_kernel.gemm_type).c_str());
+ return gemm_kernel.gemm_type;
+}
+
+// Validate lhs_info and rhs_info for native kernel
+inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const GEMMReshapeInfo &reshape_info)
+{
+ // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for the native kernel
+ TensorInfo mm_result_s32_info{};
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(
+ mm_result_s32_info,
+ a->clone()->set_tensor_shape(compute_mm_shape(*a, *b, false, reshape_info)).set_data_type(DataType::S32));
+ // Validate mm kernel
+ // NOTE: Ignore all other parameters (e.g. output stage etc.) and only validate lhs and rhs info
+ // NOTE: This assumes:
+ // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa (in ClGemmLowpMatrixMultiplyNativeKernel.cpp validate_arguments).
+ // 2. lhs and rhs info does not cause window and padding issues through side effects (in ClGemmLowpMatrixMultiplyNativeKernel.cpp validate_and_configure_window).
+ if (!bool(ClGemmLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info,
+ reshape_info)))
+ {
+ return false;
+ }
+ return true;
+}
+
+// Automatically select between mlgo (prioritized) and default heuristics for native kernel configs
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_native(auto_heuristics::CommonQuery query,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const GEMMReshapeInfo &reshape_info)
+{
+ auto config = auto_heuristics::select_mlgo_gemm_config_native(query);
+ if (config)
+ {
+ if (validate_lhs_rhs_info_native(config.lhs_info, config.rhs_info, a, b, reshape_info))
+ {
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+ "Use native config from mlgo heuristics: LHS info: %s ; RHS info: %s ",
+ to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+ return {config.lhs_info, config.rhs_info};
+ }
+ }
+ config = auto_heuristics::select_default_gemm_config_native(query);
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from default heuristics: LHS info: %s ; RHS info: %s ",
+ to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+ return {config.lhs_info, config.rhs_info};
+}
+
+// Validate lhs_info and rhs_info for reshaped only rhs kernel
+inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *output,
+ unsigned int m,
+ unsigned int n,
+ unsigned int k,
+ bool reinterpret_input_as_3d,
+ int depth_output_gemm3d)
+{
+ // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel
+ TensorInfo tmp_b_info{};
+ // Validate reshape RHS kernel
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
+ {
+ return false;
+ }
+ // Validate mm kernel
+ // NOTE: Ignore all other parameters (e.g. depth_output_gemm3d, output stage etc.) and only validate lhs and rhs info
+ // NOTE: This assumes:
+ // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa (in ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp validate_arguments).
+ // 2. lhs and rhs info does not cause window and padding issues through side effects (in ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp validate_and_configure_window).
+ GEMMKernelInfo gemm_kernel_info;
+ gemm_kernel_info.m = m;
+ gemm_kernel_info.n = n;
+ gemm_kernel_info.k = k;
+ gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+ gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d;
+ gemm_kernel_info.lhs_info = lhs_info;
+ gemm_kernel_info.rhs_info = rhs_info;
+ // Since we ignore the output stage, output data type has to be S32 to pass the validation
+ TensorInfo output_info_copy(*output);
+ output_info_copy.set_data_type(DataType::S32);
+ if (!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, &output_info_copy,
+ gemm_kernel_info)))
+ {
+ return false;
+ }
+ return true;
+}
+
+// Validate lhs_info and rhs_info for the reshaped only rhs MMUL kernel
+inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *output,
+ unsigned int m,
+ unsigned int n,
+ unsigned int k,
+ bool reinterpret_input_as_3d,
+ int depth_output_gemm3d)
+{
+ // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for the reshaped only rhs MMUL kernel
+ TensorInfo tmp_b_info{};
+ // Validate reshape RHS kernel
+ auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
+ if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
+ {
+ return false;
+ }
+ // Validate mm kernel
+ // NOTE: Ignore all other parameters (e.g. depth_output_gemm3d, output stage etc.) and only validate lhs and rhs info
+ // NOTE: This assumes:
+ // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa (in ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp validate_arguments).
+ // 2. lhs and rhs info does not cause window and padding issues through side effects (in ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp validate_and_configure_window).
+ GEMMKernelInfo gemm_kernel_info;
+ gemm_kernel_info.m = m;
+ gemm_kernel_info.n = n;
+ gemm_kernel_info.k = k;
+ gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+ gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d;
+ gemm_kernel_info.lhs_info = lhs_info;
+ gemm_kernel_info.rhs_info = rhs_info;
+ // Since we ignore the output stage, output data type has to be S32 to pass the validation
+ TensorInfo output_info_copy(*output);
+ output_info_copy.set_data_type(DataType::S32);
+ if (!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, &output_info_copy,
+ gemm_kernel_info)))
+ {
+ return false;
+ }
+ return true;
+}
+
+// Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query,
+ bool reinterpret_input_as_3d,
+ int depth_output_gemm3d,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *output)
+{
+ auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query);
+ if (config)
+ {
+ if (validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n,
+ query.k, reinterpret_input_as_3d, depth_output_gemm3d))
+ {
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+ "Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ",
+ to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+ return {config.lhs_info, config.rhs_info};
+ }
+ }
+ config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+ "Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ",
+ to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+ return {config.lhs_info, config.rhs_info};
+}
+
+// Select the reshaped only rhs MMUL kernel configs (only the default heuristics are used for this kernel)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery query,
+ bool reinterpret_input_as_3d,
+ int depth_output_gemm3d,
+ const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *output)
+{
+ ARM_COMPUTE_UNUSED(a, b, output, reinterpret_input_as_3d, depth_output_gemm3d);
+ auto config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);
+ validate_lhs_rhs_info_reshaped_only_rhs_mmul(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n,
+ query.k, reinterpret_input_as_3d, depth_output_gemm3d);
+ ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+ "Use reshaped_only_rhs_mmul config from default heuristics: LHS info: %s ; RHS info: %s ",
+ to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+ return {config.lhs_info, config.rhs_info};
+}
+
+inline bool is_gemm_reshaped(CLGEMMKernelType kernel_type)
+{
+ switch (kernel_type)
+ {
+ case CLGEMMKernelType::NATIVE:
+ return false;
+ case CLGEMMKernelType::RESHAPED_ONLY_RHS:
+ case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL:
+ return true;
+ default:
+ ARM_COMPUTE_ERROR("Not supported gemmlowp kernel!");
+ }
+}
+} // namespace
+
+ClGemmLowpMatrixMultiplyCore::ClGemmLowpMatrixMultiplyCore()
+ : _weights_to_qasymm8(std::make_unique<ClCastKernel>()),
+ _mm_native_kernel(std::make_unique<ClGemmLowpMatrixMultiplyNativeKernel>()),
+ _mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel>()),
+ _mm_reshaped_only_rhs_mmul_kernel(std::make_unique<ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel>()),
+ _mtx_b_reshape_kernel(std::make_unique<ClGemmReshapeRhsMatrixKernel>()),
+ _mtx_a_reduction_kernel(std::make_unique<ClGemmLowpMatrixAReductionKernel>()),
+ _mtx_b_reduction_kernel(std::make_unique<ClGemmLowpMatrixBReductionKernel>()),
+ _offset_contribution_kernel(std::make_unique<ClGemmLowpOffsetContributionKernel>()),
+ _offset_contribution_output_stage_kernel(std::make_unique<ClGemmLowpOffsetContributionOutputStageKernel>()),
+ _aux_mem(AuxTensorIdx::Count)
+{
+}
+
+ClGemmLowpMatrixMultiplyCore::~ClGemmLowpMatrixMultiplyCore() = default;
+
+void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context,
+ ITensorInfo *a,
+ ITensorInfo *b,
+ ITensorInfo *c,
+ ITensorInfo *output,
+ const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+ ARM_COMPUTE_ERROR_THROW_ON(ClGemmLowpMatrixMultiplyCore::validate(a, b, c, output, gemm_info));
+ ARM_COMPUTE_LOG_PARAMS(a, b, c, output, gemm_info);
+
+ _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
+ _a_offset = a->quantization_info().uniform().offset;
+ _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) &&
+ is_data_type_quantized_symmetric(b->data_type()) && a->data_type() == DataType::QASYMM8;
+ _b_offset = _convert_to_qasymm8 ? -128 : b->quantization_info().uniform().offset;
+ _gemm_info = gemm_info;
+
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
+ // Set the target for the kernels
+ _mm_native_kernel->set_target(gpu_target);
+ _mm_reshaped_only_rhs_kernel->set_target(gpu_target);
+ _mm_reshaped_only_rhs_mmul_kernel->set_target(gpu_target);
+
+ GEMMRHSMatrixInfo rhs_info;
+ GEMMLHSMatrixInfo lhs_info;
+
+ // Arguments used by GEMMReshapeInfo
+ // in order to know how the matrices have been reshaped
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+
+ const auto reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
+
+ _gemm_kernel_type = auto_select_gemm_kernel(
+ auto_heuristics::CommonQuery{gpu_target, a->data_type(), m, n, k, batch_size}, _reshape_b_only_on_first_run);
+
+ if (_convert_to_qasymm8)
+ {
+ // Set data type for converted weights
+ _qasymm8_weights = *b;
+ _qasymm8_weights.set_data_type(DataType::QASYMM8);
+ _weights_to_qasymm8->configure(compile_context, b, &_qasymm8_weights, ConvertPolicy::WRAP);
+ }
+
+ ITensorInfo *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b;
+ if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
+ {
+ matrix_b = &_tmp_b;
+
+ // Pick up the GEMM configuration
+ // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
+ std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(
+ auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, reinterpret_input_as_3d,
+ depth_output_gemm3d, a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output);
+
+ // Configure reshape RHS kernel
+ _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b,
+ rhs_info);
+ }
+ if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+ {
+ matrix_b = &_tmp_b;
+
+ // Pick up the GEMM configuration
+ // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
+ std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs_mmul(
+ auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, reinterpret_input_as_3d,
+ depth_output_gemm3d, a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output);
+
+ // Configure reshape RHS kernel
+ _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b,
+ rhs_info);
+ }
+
+ // Using default reduction info
+ const GEMMLowpReductionKernelInfo reduction_info{};
+
+ // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
+ if (_a_offset != 0)
+ {
+ _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
+
+ // Configure Matrix B reduction kernel
+ _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b,
+ &_vector_sum_col, reduction_info);
+ }
+
+ // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
+ if (_b_offset != 0)
+ {
+ _vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
+
+ // Configure matrix A reduction kernel
+ _mtx_a_reduction_kernel->configure(compile_context, a, &_vector_sum_row, reduction_info);
+ }
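+ // For reference, the reductions above feed the offset-contribution step, following the usual
+ // GEMMLowp decomposition (sketch):
+ //   sum_k (a[i][k] - a_off) * (b[k][j] - b_off)
+ //     = sum_k a[i][k] * b[k][j]          (raw int32 GEMM result)
+ //     - a_off * sum_k b[k][j]            (column sums of B -> _vector_sum_col)
+ //     - b_off * sum_k a[i][k]            (row sums of A    -> _vector_sum_row)
+ //     + K * a_off * b_off
+ // which is why the B reduction is only needed when a_offset != 0 and the A reduction only
+ // when b_offset != 0.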
+
+ GEMMKernelInfo gemm_kernel_info;
+ gemm_kernel_info.m = m;
+ gemm_kernel_info.n = n;
+ gemm_kernel_info.k = k;
+ gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d;
+ gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+ gemm_kernel_info.lhs_info = lhs_info;
+ gemm_kernel_info.rhs_info = rhs_info;
+ gemm_kernel_info.a_offset = _a_offset;
+ gemm_kernel_info.b_offset = _b_offset;
+ // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
+ if (gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+ {
+ // Configure offset contribution kernel
+ const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel)
+ ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size()
+ : 1;
+
+ _gemm_output_stage_multipliers = TensorInfo(TensorShape(num_filters), 1, DataType::S32);
+ _gemm_output_stage_shifts = TensorInfo(TensorShape(num_filters), 1, DataType::S32);
+
+ GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
+ gemmlowp_output_stage.output_data_type = a->data_type();
+ if (num_filters == 1)
+ {
+ // Per-channel quantization with OFM == 1 is equivalent to uniform quantization.
+ // Setting this flag to false prevents the kernel from adding useless padding to the output multipliers and shifts
+ gemmlowp_output_stage.is_quantized_per_channel = false;
+ }
+
+ gemm_kernel_info.output_stage = gemmlowp_output_stage;
+
+ if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS &&
+ gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ {
+ // Configure and tune matrix multiply kernel with fused output stage
+ _mm_reshaped_only_rhs_kernel->configure(
+ compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr,
+ &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
+ }
+ else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL &&
+ gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ {
+ // Configure and tune matrix multiply kernel with fused output stage
+ _mm_reshaped_only_rhs_mmul_kernel->configure(
+ compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr,
+ &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
+ }
+ else
+ {
+ _run_output_stage = true;
+
+ if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
+ {
+ _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32,
+ gemm_kernel_info);
+ }
+ if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+ {
+ _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32,
+ gemm_kernel_info);
+ }
+ else
+ {
+ // Pick up the GEMM configuration
+ // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
+ std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(
+ auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, a,
+ _convert_to_qasymm8 ? &_qasymm8_weights : matrix_b, reshape_info);
+
+ // Configure matrix multiply kernel
+ _mm_native_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, lhs_info, rhs_info,
+ reshape_info);
+
+ _offset_contribution_output_stage_kernel->configure(
+ compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, output, a->dimension(0),
+ _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers,
+ &_gemm_output_stage_shifts);
+ }
+ }
+ }
+ else
+ {
+ _run_offset_contribution = true;
+ if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
+ {
+ // Configure and tune matrix multiply kernel
+ _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info);
+ }
+ else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+ {
+ // Configure and tune matrix multiply kernel
+ _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info);
+ }
+ else
+ {
+ // Pick up the GEMM configuration
+ // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
+ std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(
+ auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, a,
+ _convert_to_qasymm8 ? &_qasymm8_weights : b, reshape_info);
+
+ // Configure matrix multiply kernel
+ _mm_native_kernel->configure(compile_context, a, matrix_b, output, lhs_info, rhs_info, reshape_info);
+ }
+
+ // Configure offset contribution kernel
+ _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col,
+ _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr,
+ a->dimension(0), _a_offset, _b_offset);
+ }
+
+ // Request memory
+ _aux_mem[RhsQAsymm8] =
+ MemoryInfo(offset_int_vec(RhsQAsymm8),
+ _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary,
+ _qasymm8_weights.total_size());
+ if (is_gemm_reshaped(_gemm_kernel_type))
+ {
+ // If the gemm is reshaped, overwrite the Rhs lifetime with Prepare, as the weights go through a two-step transformation (cast, then reshape)
+ _aux_mem[RhsQAsymm8] =
+ MemoryInfo(offset_int_vec(RhsQAsymm8),
+ _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary,
+ _qasymm8_weights.total_size());
+ _aux_mem[RhsReshape] = MemoryInfo(
+ offset_int_vec(RhsReshape),
+ _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
+ }
+ if (_a_offset != 0)
+ {
+ _aux_mem[VecSumCol] =
+ MemoryInfo(offset_int_vec(VecSumCol),
+ _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary,
+ _vector_sum_col.total_size());
+ }
+ if (_b_offset != 0)
+ {
+ _aux_mem[VecSumRow] =
+ MemoryInfo(offset_int_vec(VecSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
+ }
+ _aux_mem[ResultS32] = MemoryInfo(offset_int_vec(ResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
+ _aux_mem[Multipliers] = MemoryInfo(offset_int_vec(Multipliers), MemoryLifetime::Persistent,
+ _gemm_output_stage_multipliers.total_size());
+ _aux_mem[Shifts] =
+ MemoryInfo(offset_int_vec(Shifts), MemoryLifetime::Persistent, _gemm_output_stage_shifts.total_size());
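+ // Note on the lifetimes used above (editorial summary): Persistent buffers are kept alive
+ // across run() calls, Prepare buffers are only needed while prepare() executes, and Temporary
+ // buffers only for the duration of a single run(), hence the dependence on
+ // _reshape_b_only_on_first_run.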
+}
+
+Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ const GEMMInfo &gemm_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+ ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED);
+ ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8_SIGNED && b->data_type() == DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
+
+ int32_t a_offset = a->quantization_info().uniform().offset;
+ int32_t b_offset = b->quantization_info().uniform().offset;
+
+ const ITensorInfo *matrix_a_info = a;
+
+ TensorInfo tmp_b_info{};
+ GEMMRHSMatrixInfo rhs_info;
+ GEMMLHSMatrixInfo lhs_info;
+
+ // Get the GPU target
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
+ bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
+ const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+ const unsigned int n = b->dimension(0);
+ const unsigned int k = a->dimension(0);
+ const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+ const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+
+ bool reshape_matrix_b = is_gemm_reshaped(
+ auto_select_gemm_kernel(auto_heuristics::CommonQuery{gpu_target, a->data_type(), m, n, k, batch_size},
+ gemm_info.reshape_b_only_on_first_run()));
+
+ const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
+
+ bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) &&
+ is_data_type_quantized_symmetric(b->data_type()) &&
+ is_data_type_quantized_asymmetric(a->data_type());
+ TensorInfo weights_info(*b);
+ if (convert_to_qasymm8)
+ {
+ b_offset = -128;
+ weights_info.set_data_type(DataType::QASYMM8);
+ ARM_COMPUTE_RETURN_ON_ERROR(ClCastKernel::validate(b, &weights_info, ConvertPolicy::WRAP));
+ }
+ const ITensorInfo *matrix_b_info = &weights_info;
+ if (reshape_matrix_b)
+ {
+ matrix_b_info = &tmp_b_info;
+
+ // Pick up the GEMM configuration
+ // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
+ // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
+ const auto res = select_default_gemm_config_reshaped_only_rhs(
+ auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size});
+ lhs_info = res.lhs_info;
+ rhs_info = res.rhs_info;
+
+ // Validate reshape RHS kernel
+ auto_init_if_empty(tmp_b_info,
+ weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info)));
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info));
+ }
+
+ TensorInfo info_vector_sum_col{};
+ TensorInfo info_vector_sum_row{};
+
+ const GEMMLowpReductionKernelInfo reduction_info;
+ // Validate matrix B reduction kernel only if _a_offset is not equal to 0
+ if (a_offset != 0)
+ {
+ info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32);
+
+ // Configure Matrix B reduction kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ ClGemmLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info));
+ }
+
+ // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
+ if (b_offset != 0)
+ {
+ info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
+
+ // Configure matrix A reduction kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ ClGemmLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info));
+ }
+
+ GEMMKernelInfo gemm_kernel_info;
+ gemm_kernel_info.m = m;
+ gemm_kernel_info.n = n;
+ gemm_kernel_info.k = k;
+ gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d;
+ gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d;
+ gemm_kernel_info.lhs_info = lhs_info;
+ gemm_kernel_info.rhs_info = rhs_info;
+ gemm_kernel_info.a_offset = a_offset;
+ gemm_kernel_info.b_offset = b_offset;
+ if (gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+ {
+ const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel)
+ ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size()
+ : 1;
+
+ const TensorInfo gemm_output_stage_multipliers_shifts_info(
+ TensorInfo(TensorShape(num_filters), 1, DataType::S32));
+
+ GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
+ gemmlowp_output_stage.output_data_type = a->data_type();
+
+ gemm_kernel_info.output_stage = gemmlowp_output_stage;
+ if (reshape_matrix_b &&
+ gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(
+ matrix_a_info, matrix_b_info, output, gemm_kernel_info, a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row, c, &gemm_output_stage_multipliers_shifts_info,
+ &gemm_output_stage_multipliers_shifts_info));
+ }
+ else
+ {
+ TensorInfo mm_result_s32_info{};
+
+ if (reshape_matrix_b)
+ {
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(mm_result_s32_info, a->clone()
+ ->set_tensor_shape(compute_mm_shape(
+ *matrix_a_info, *matrix_b_info, reshape_info))
+ .set_data_type(DataType::S32));
+
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(
+ matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info));
+ }
+ else
+ {
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(mm_result_s32_info, a->clone()
+ ->set_tensor_shape(compute_mm_shape(
+ *matrix_a_info, *matrix_b_info, false, reshape_info))
+ .set_data_type(DataType::S32));
+
+ // Pick up the GEMM configuration
+ // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
+ // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
+ const auto res = select_default_gemm_config_native(
+ auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size});
+ lhs_info = res.lhs_info;
+ rhs_info = res.rhs_info;
+
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(
+ matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
+ }
+
+ // Validate offset contribution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionOutputStageKernel::validate(
+ &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col,
+ b_offset == 0 ? nullptr : &info_vector_sum_row, c, output, a_offset, b_offset, gemmlowp_output_stage,
+ &gemm_output_stage_multipliers_shifts_info, &gemm_output_stage_multipliers_shifts_info));
+ }
+ }
+ else
+ {
+ if (reshape_matrix_b)
+ {
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(
+ matrix_a_info, matrix_b_info, output, gemm_kernel_info));
+ }
+ else
+ {
+ // Pick up the GEMM configuration
+ // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
+ const auto res = select_default_gemm_config_native(
+ auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size});
+ lhs_info = res.lhs_info;
+ rhs_info = res.rhs_info;
+
+ // Validate matrix multiply
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(
+ matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
+ }
+
+ if (output->total_size() != 0)
+ {
+ // Validate offset contribution kernel
+ ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionKernel::validate(
+ output, a_offset == 0 ? nullptr : &info_vector_sum_col, b_offset == 0 ? nullptr : &info_vector_sum_row,
+ c, a_offset, b_offset));
+ }
+ }
+
+ return Status{};
+}
+
+void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors)
+{
+ const ITensor *a = tensors.get_const_tensor(ACL_SRC_0);
+ const ITensor *b = tensors.get_const_tensor(ACL_SRC_1);
+ const ITensor *c = tensors.get_const_tensor(ACL_SRC_2);
+ ITensor *dst = tensors.get_tensor(ACL_DST);
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(a, dst);
+
+ CLAuxTensorHandler vec_sum_col(offset_int_vec(VecSumCol), _vector_sum_col, tensors, true);
+ CLAuxTensorHandler vec_sum_row(offset_int_vec(VecSumRow), _vector_sum_row, tensors, true);
+ CLAuxTensorHandler rhs_qasymm8(offset_int_vec(RhsQAsymm8), _qasymm8_weights, tensors, true);
+ CLAuxTensorHandler tmp_b(offset_int_vec(RhsReshape), _tmp_b, tensors, true);
+ CLAuxTensorHandler res32(offset_int_vec(ResultS32), _mm_result_s32, tensors, true);
+ CLAuxTensorHandler shifts(offset_int_vec(Shifts), _gemm_output_stage_shifts, tensors, true);
+ CLAuxTensorHandler multipliers(offset_int_vec(Multipliers), _gemm_output_stage_multipliers, tensors, true);
+
+ // Prepare the constants if needed
+ prepare(tensors);
+
+ const ITensor *matrix_a = a;
+ const ITensor *matrix_b = _convert_to_qasymm8 ? rhs_qasymm8.get() : b;
+
+ if (is_gemm_reshaped(_gemm_kernel_type))
+ {
+ matrix_b = tmp_b.get();
+ if (!_reshape_b_only_on_first_run)
+ {
+ // Run reshape matrix B
+ ITensorPack mtx_b_reshape_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b},
+ {TensorType::ACL_DST, tmp_b.get()}};
+ CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_reshape_pack, false);
+ }
+ }
+
+ // Run matrix B reduction kernel only if _a_offset is not equal to 0
+ if (_a_offset != 0 && !_reshape_b_only_on_first_run)
+ {
+ ITensorPack mtx_b_red_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b},
+ {TensorType::ACL_DST, vec_sum_col.get()}};
+ CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false);
+ }
+
+ // Run matrix A reduction kernel only if _b_offset is not equal to 0
+ if (_b_offset != 0)
+ {
+ ITensorPack mtx_a_red_pack = {{TensorType::ACL_SRC, matrix_a}, {TensorType::ACL_DST, vec_sum_row.get()}};
+ CLScheduler::get().enqueue_op(*_mtx_a_reduction_kernel, mtx_a_red_pack, false);
+ }
+
+ // Run matrix multiply
+ if (is_gemm_reshaped(_gemm_kernel_type))
+ {
+ ITensorPack gemm_reshaped_pack;
+ if (_run_offset_contribution)
+ {
+ gemm_reshaped_pack = ITensorPack({{TensorType::ACL_SRC_0, matrix_a},
+ {TensorType::ACL_SRC_1, matrix_b},
+ {TensorType::ACL_DST, _run_output_stage ? res32.get() : dst}});
+ }
+ else
+ {
+ gemm_reshaped_pack = ITensorPack({
+ {TensorType::ACL_SRC, matrix_a},
+ {TensorType::ACL_SRC_1, matrix_b},
+ {TensorType::ACL_BIAS, c},
+ {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()},
+ {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()},
+ {TensorType::ACL_SHIFTS, shifts.get()},
+ {TensorType::ACL_MULTIPLIERS, multipliers.get()},
+ {TensorType::ACL_DST, dst},
+ });
+ }
+ if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
+ {
+ CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_pack, false);
+ }
+ else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+ {
+ CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_mmul_kernel, gemm_reshaped_pack, false);
+ }
+ else
+ {
+ ARM_COMPUTE_ERROR("Invalid reshaped kernel");
+ }
+ }
+ else
+ {
+ ITensorPack gemm_native_pack = {{TensorType::ACL_SRC_0, matrix_a},
+ {TensorType::ACL_SRC_1, matrix_b},
+ {TensorType::ACL_DST, _run_offset_contribution ? dst : res32.get()}};
+ CLScheduler::get().enqueue_op(*_mm_native_kernel, gemm_native_pack, false);
+ }
+ if (_run_output_stage)
+ {
+ // Run offset contribution/output stage kernel
+ ITensorPack output_stage_pack = {
+ {TensorType::ACL_SRC, res32.get()},
+ {TensorType::ACL_BIAS, c},
+ {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()},
+ {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()},
+ {TensorType::ACL_SHIFTS, shifts.get()},
+ {TensorType::ACL_MULTIPLIERS, multipliers.get()},
+ {TensorType::ACL_DST, dst},
+ };
+ CLScheduler::get().enqueue_op(*_offset_contribution_output_stage_kernel, output_stage_pack, true);
+ }
+ if (_run_offset_contribution)
+ {
+ // Run offset contribution kernel
+ ITensorPack offset_contrib_pack = {{TensorType::ACL_SRC_DST, dst},
+ {TensorType::ACL_BIAS, c},
+ {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()},
+ {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()}};
+ CLScheduler::get().enqueue_op(*_offset_contribution_kernel, offset_contrib_pack, true);
+ }
+}
+
+void ClGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)
+{
+ if (!_is_prepared)
+ {
+ auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ CLAuxTensorHandler tmp_b(offset_int_vec(RhsReshape), _tmp_b, tensors, true);
+ CLAuxTensorHandler vec_sum_col(offset_int_vec(VecSumCol), _vector_sum_col, tensors, true);
+ CLAuxTensorHandler rhs_qasymm8(offset_int_vec(RhsQAsymm8), _qasymm8_weights, tensors, false);
+
+ ARM_COMPUTE_ERROR_ON_NULLPTR(b);
+
+ if (_convert_to_qasymm8)
+ {
+ ITensorPack convert_to_qs8_pack = {{ACL_SRC, b}, {ACL_DST, rhs_qasymm8.get()}};
+ CLScheduler::get().enqueue_op(*_weights_to_qasymm8, convert_to_qs8_pack, false);
+ b->mark_as_unused();
+ }
+
+ if (is_gemm_reshaped(_gemm_kernel_type) && _reshape_b_only_on_first_run)
+ {
+ // Run reshape kernel and mark original weights tensor as unused
+ ITensorPack mtx_b_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b},
+ {TensorType::ACL_DST, tmp_b.get()}};
+ CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_pack, false);
+ b->mark_as_unused();
+ }
+
+ // Run matrix B reduction kernel only if _a_offset is not equal to 0
+ if (_a_offset != 0 && _reshape_b_only_on_first_run)
+ {
+ ITensorPack mtx_b_red_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b},
+ {TensorType::ACL_DST, vec_sum_col.get()}};
+ CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false);
+ }
+
+ // Compute GEMM output multipliers and shifts for output stage
+ {
+ const size_t num_filters = (_gemm_info.gemmlowp_output_stage().is_quantized_per_channel)
+ ? _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size()
+ : 1;
+
+ CLAuxTensorHandler multipliers(offset_int_vec(Multipliers), _gemm_output_stage_multipliers, tensors, false);
+ CLAuxTensorHandler shifts(offset_int_vec(Shifts), _gemm_output_stage_shifts, tensors, false);
+
+ ICLTensor *multiplier_tensor = multipliers.get();
+ if (multiplier_tensor != nullptr && multiplier_tensor->info()->total_size() > 0)
+ {
+ multiplier_tensor->map(CLScheduler::get().queue(), true);
+ std::memcpy(multiplier_tensor->ptr_to_element(Coordinates(0)),
+ _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(),
+ num_filters * sizeof(int32_t));
+ multiplier_tensor->unmap(CLScheduler::get().queue());
+ }
+
+ ICLTensor *shifts_tensor = shifts.get();
+            if (shifts_tensor != nullptr && shifts_tensor->info()->total_size() > 0)
+ {
+ shifts_tensor->map(CLScheduler::get().queue(), true);
+ std::memcpy(shifts_tensor->ptr_to_element(Coordinates(0)),
+ _gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t));
+ shifts_tensor->unmap(CLScheduler::get().queue());
+ }
+ }
+ CLScheduler::get().queue().finish();
+ _is_prepared = true;
+ }
+}
+
+experimental::MemoryRequirements ClGemmLowpMatrixMultiplyCore::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace opencl
+} // namespace arm_compute
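
A minimal usage sketch of the operator above, calling the internal operator API directly. The shapes, quantization parameters and pack slot ids are illustrative assumptions, and the per-call workspace allocation is a simplification; production code would normally route the MemoryRequirements through a memory manager via the public CLGEMMLowpMatrixMultiplyCore function.

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"

#include <memory>
#include <vector>

using namespace arm_compute;

void gemmlowp_core_example()
{
    CLScheduler::get().default_init();

    const unsigned int M = 64, N = 128, K = 256;
    TensorInfo a_info(TensorShape(K, M), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10));
    TensorInfo b_info(TensorShape(N, K), 1, DataType::QASYMM8, QuantizationInfo(0.25f, 3));
    TensorInfo dst_info(TensorShape(N, M), 1, DataType::S32); // default GEMMInfo: no output stage, S32 dst

    opencl::ClGemmLowpMatrixMultiplyCore op;
    op.configure(CLKernelLibrary::get().get_compile_context(), &a_info, &b_info, nullptr, &dst_info);

    CLTensor a, b, dst;
    a.allocator()->init(a_info);
    b.allocator()->init(b_info);
    dst.allocator()->init(dst_info);
    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();

    // Assumed slot ids: ACL_SRC_0 = matrix A, ACL_SRC_1 = matrix B, ACL_DST = result.
    ITensorPack pack{{TensorType::ACL_SRC_0, &a}, {TensorType::ACL_SRC_1, &b}, {TensorType::ACL_DST, &dst}};

    // Back every workspace request with a plain U8 CLTensor and expose it under its slot id.
    std::vector<std::unique_ptr<CLTensor>> aux;
    for (const auto &req : op.workspace())
    {
        aux.emplace_back(std::make_unique<CLTensor>());
        aux.back()->allocator()->init(TensorInfo(TensorShape(req.size), 1, DataType::U8));
        aux.back()->allocator()->allocate();
        pack.add_tensor(req.slot, aux.back().get());
    }

    op.prepare(pack); // one-off work: weight reshape / matrix B reduction
    op.run(pack);     // per-call matrix multiply + offset contribution
}
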
diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h
new file mode 100644
index 0000000000..c80dc3a182
--- /dev/null
+++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2017-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_CORE_H
+#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_CORE_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/function_info/GEMMInfo.h"
+#include "arm_compute/runtime/CL/CLTypes.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+// Forward declarations
+class ClCastKernel;
+class ClGemmLowpMatrixMultiplyNativeKernel;
+class ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel;
+class ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel;
+class ClGemmReshapeRhsMatrixKernel;
+class ClGemmLowpMatrixAReductionKernel;
+class ClGemmLowpMatrixBReductionKernel;
+class ClGemmLowpOffsetContributionKernel;
+class ClGemmLowpOffsetContributionOutputStageKernel;
+} // namespace kernels
+
+/** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. */
+class ClGemmLowpMatrixMultiplyCore : public IClOperator
+{
+public:
+ ClGemmLowpMatrixMultiplyCore();
+ ~ClGemmLowpMatrixMultiplyCore();
+ /** Initialise the kernel's inputs, output
+ *
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:--------|:--------------|
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QASYMM8 |S32 |S32 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |S32 |
+ * |QASYMM8 |QSYMM8 |S32 |S32 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8 |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |S32 |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |S32 |
+ * |QASYMM8_SIGNED |QSYMM8 |S32 |S32 |
+ *
+ * @note GEMMLowp: low precision GEMM kernel. [A * B + C]
+ * This kernel performs the following computations:
+ *
+ * -# Convert the values of a from 8-bit quantized to int32 and add a_offset to each of them.
+ * -# Convert the values of b from 8-bit quantized to int32 and add b_offset to each of them.
+ * -# Compute the matrix product of the resulting a * b in int32.
+ * -# Quantize to uint8 if gemm_info.gemmlowp_output_stage != NONE
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED.
+ * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a
+ * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: S32
+ * @param[out] output Output tensor. Data type supported: S32 or QASYMM8/QASYMM8_SIGNED if gemm_info.gemmlowp_output_stage != NONE
+ * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
+ * if the reshape of matrix B should be executed only for the first run
+ */
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *a,
+ ITensorInfo *b,
+ ITensorInfo *c,
+ ITensorInfo *output,
+ const GEMMInfo &gemm_info = GEMMInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClGemmLowpMatrixMultiplyCore::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *a,
+ const ITensorInfo *b,
+ const ITensorInfo *c,
+ const ITensorInfo *output,
+ const GEMMInfo &gemm_info = GEMMInfo());
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ enum AuxTensorIdx
+ {
+ ResultS32 = 0,
+ RhsQAsymm8,
+ RhsReshape,
+ VecSumCol,
+ VecSumRow,
+ Multipliers,
+ Shifts,
+ Count
+ };
+
+private:
+ // Kernels used
+ std::unique_ptr<kernels::ClCastKernel> _weights_to_qasymm8;
+ std::unique_ptr<kernels::ClGemmLowpMatrixMultiplyNativeKernel> _mm_native_kernel;
+ std::unique_ptr<kernels::ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel> _mm_reshaped_only_rhs_kernel;
+ std::unique_ptr<kernels::ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel> _mm_reshaped_only_rhs_mmul_kernel;
+ std::unique_ptr<kernels::ClGemmReshapeRhsMatrixKernel> _mtx_b_reshape_kernel;
+ std::unique_ptr<kernels::ClGemmLowpMatrixAReductionKernel> _mtx_a_reduction_kernel;
+ std::unique_ptr<kernels::ClGemmLowpMatrixBReductionKernel> _mtx_b_reduction_kernel;
+ std::unique_ptr<kernels::ClGemmLowpOffsetContributionKernel> _offset_contribution_kernel;
+ std::unique_ptr<kernels::ClGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel;
+
+ // Temporary tensors
+ TensorInfo _qasymm8_weights{};
+ TensorInfo _vector_sum_col{};
+ TensorInfo _vector_sum_row{};
+ TensorInfo _tmp_b{};
+ TensorInfo _mm_result_s32{};
+ TensorInfo _gemm_output_stage_multipliers{};
+ TensorInfo _gemm_output_stage_shifts{};
+
+ int32_t _a_offset{0};
+ int32_t _b_offset{0};
+ bool _reshape_b_only_on_first_run{false};
+ bool _run_output_stage{false};
+ bool _convert_to_qasymm8{false};
+ bool _run_offset_contribution{false};
+ bool _is_prepared{false};
+ GEMMInfo _gemm_info{};
+ CLGEMMKernelType _gemm_kernel_type{};
+
+ experimental::MemoryRequirements _aux_mem{};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_CORE_H */
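
A scalar reference of the computation described in the note above (steps 1 to 3, with the optional quantize-down left out); a_offset and b_offset follow the gemmlowp convention of being added to the widened 8-bit values:

#include <cstdint>
#include <vector>

// dst[m][n] = sum_k (a[m][k] + a_offset) * (b[k][n] + b_offset), accumulated in int32.
std::vector<int32_t> gemmlowp_reference(const std::vector<uint8_t> &a, const std::vector<uint8_t> &b,
                                        int M, int N, int K, int32_t a_offset, int32_t b_offset)
{
    std::vector<int32_t> dst(static_cast<size_t>(M) * N, 0);
    for (int m = 0; m < M; ++m)
    {
        for (int n = 0; n < N; ++n)
        {
            int32_t acc = 0;
            for (int k = 0; k < K; ++k)
            {
                const int32_t va = static_cast<int32_t>(a[m * K + k]) + a_offset;
                const int32_t vb = static_cast<int32_t>(b[k * N + n]) + b_offset;
                acc += va * vb;
            }
            // Quantized down to QASYMM8/QASYMM8_SIGNED only when gemmlowp_output_stage != NONE.
            dst[static_cast<size_t>(m) * N + n] = acc;
        }
    }
    return dst;
}
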
diff --git a/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp b/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp
new file mode 100644
index 0000000000..e3363e3685
--- /dev/null
+++ b/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClGemmLowpOutputStage.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h"
+#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClGemmLowpOutputStage::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_LOG_PARAMS(src, bias, dst, info);
+
+ switch (info.type)
+ {
+ case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
+ {
+ auto k = std::make_unique<opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel>();
+ k->configure(compile_context, src, bias, dst, &info);
+ _kernel = std::move(k);
+ break;
+ }
+ case GEMMLowpOutputStageType::QUANTIZE_DOWN:
+ {
+ auto k = std::make_unique<opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel>();
+ k->configure(compile_context, src, bias, dst, &info);
+ _kernel = std::move(k);
+ break;
+ }
+ case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT:
+ {
+ auto k = std::make_unique<opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel>();
+ k->configure(compile_context, src, bias, dst, &info);
+ _kernel = std::move(k);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Unsupported GEMMLowpOutputStage type.");
+ }
+}
+
+Status ClGemmLowpOutputStage::validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::QSYMM16);
+
+ switch (info.type)
+ {
+ case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
+ return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(src, bias, dst, &info);
+ case GEMMLowpOutputStageType::QUANTIZE_DOWN:
+ return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel::validate(src, bias, dst, &info);
+ case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT:
+ return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(src, bias, dst, &info);
+ default:
+ return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported GEMMLowpOutputStage type.");
+ }
+}
+
+void ClGemmLowpOutputStage::run(ITensorPack &tensors)
+{
+ const ITensor *src = tensors.get_const_tensor(ACL_SRC);
+ const ITensor *bias = tensors.get_const_tensor(ACL_BIAS);
+ ITensor *dst = tensors.get_tensor(ACL_DST);
+
+ ITensorPack pack{{ACL_SRC, src}, {ACL_BIAS, bias}, {ACL_DST, dst}};
+ CLScheduler::get().enqueue_op(*_kernel, pack, true);
+}
+} // namespace opencl
+} // namespace arm_compute
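
As a companion to the QUANTIZE_DOWN_FIXEDPOINT case above, a scalar sketch of the gemmlowp-style integer requantization this output stage performs, assuming a non-negative result shift; the signed and per-channel variants follow the same pattern:

#include <algorithm>
#include <cstdint>
#include <limits>

// Saturating rounding doubling high multiply, as defined by gemmlowp.
inline int32_t saturating_rounding_doubling_high_mul(int32_t a, int32_t b)
{
    const bool    overflow = (a == b) && (a == std::numeric_limits<int32_t>::min());
    const int64_t ab       = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    const int64_t nudge    = ab >= 0 ? (1ll << 30) : (1 - (1ll << 30));
    const int32_t high32   = static_cast<int32_t>((ab + nudge) / (1ll << 31));
    return overflow ? std::numeric_limits<int32_t>::max() : high32;
}

// Rounding (to nearest) divide by a power of two, as in gemmlowp.
inline int32_t rounding_divide_by_pow2(int32_t x, int exponent)
{
    const int32_t mask      = (int32_t(1) << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
    return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}

inline uint8_t quantize_down_fixedpoint(int32_t acc, int32_t bias, int32_t result_multiplier,
                                        int32_t result_shift, int32_t result_offset,
                                        int32_t min_bound, int32_t max_bound)
{
    acc += bias; // optional bias addition
    acc = rounding_divide_by_pow2(saturating_rounding_doubling_high_mul(acc, result_multiplier), result_shift);
    acc += result_offset; // destination zero point
    return static_cast<uint8_t>(std::min(std::max(acc, min_bound), max_bound)); // clamp to [min, max]
}
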
diff --git a/src/gpu/cl/operators/ClGemmLowpOutputStage.h b/src/gpu/cl/operators/ClGemmLowpOutputStage.h
new file mode 100644
index 0000000000..6357e0200b
--- /dev/null
+++ b/src/gpu/cl/operators/ClGemmLowpOutputStage.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_OUTPUT_STAGE_H
+#define ARM_COMPUTE_CL_GEMMLOWP_OUTPUT_STAGE_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+/** This file contains all available output stages for GEMMLowp on OpenCL.
+ *
+ * In gemmlowp, the "output stage" is the process that takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyCore),
+ * and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ *
+ * More information about the GEMMLowp output stage can be found at https://github.com/google/gemmlowp/blob/master/doc/output.md
+ */
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to execute GEMMLowpQuantizeDown kernels on CL.
+ *
+ * This function calls the following CL kernels:
+ *
+ * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel
+ * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel
+ * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel
+ */
+class ClGemmLowpOutputStage : public IClOperator
+{
+public:
+ /** Constructor */
+ ClGemmLowpOutputStage() = default;
+ /** Initialise the kernel's inputs, output
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |dst |
+ * |:--------------|:-------------|:-------------|
+ * |S32 |S32 |QASYMM8 |
+ * |S32 |S32 |QASYMM8_SIGNED|
+ * |S32 |S32 |QSYMM16 |
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor. Data type supported: S32
+ * @param[in] bias Biases tensor. Only shared biases are supported; it can be nullptr if bias addition is not required.
+ * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p src.
+ * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
+ * @param[in] info GEMMLowp output stage metadata.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ const ITensorInfo *bias,
+ ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClGemmLowpOutputStage::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *bias,
+ const ITensorInfo *dst,
+ const GEMMLowpOutputStageInfo &info);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_OUTPUT_STAGE_H */
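
A usage sketch for the operator declared above. The requantization parameters are illustrative placeholders (real values are derived from the input and output scales, for example with the helpers in arm_compute/core/utils/quantization/AsymmHelpers.h), and the pack slots mirror the ones used in ClGemmLowpOutputStage::run():

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/function_info/GEMMInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/gpu/cl/operators/ClGemmLowpOutputStage.h"

using namespace arm_compute;

void output_stage_example()
{
    CLScheduler::get().default_init();

    TensorInfo src_info(TensorShape(128U, 64U), 1, DataType::S32);
    TensorInfo bias_info(TensorShape(128U), 1, DataType::S32);
    TensorInfo dst_info(TensorShape(128U, 64U), 1, DataType::QASYMM8, QuantizationInfo(0.125f, 10));

    GEMMLowpOutputStageInfo info{};
    info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
    info.gemmlowp_multiplier = 1073741824; // example: 0.5 in Q0.31 fixed point
    info.gemmlowp_shift      = 3;
    info.gemmlowp_offset     = 10;         // destination zero point
    info.gemmlowp_min_bound  = 0;
    info.gemmlowp_max_bound  = 255;
    info.output_data_type    = DataType::QASYMM8;

    opencl::ClGemmLowpOutputStage stage;
    ARM_COMPUTE_ERROR_THROW_ON(opencl::ClGemmLowpOutputStage::validate(&src_info, &bias_info, &dst_info, info));
    stage.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &bias_info, &dst_info, info);

    CLTensor src, bias, dst;
    src.allocator()->init(src_info);
    bias.allocator()->init(bias_info);
    dst.allocator()->init(dst_info);
    src.allocator()->allocate();
    bias.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack pack{{TensorType::ACL_SRC, &src}, {TensorType::ACL_BIAS, &bias}, {TensorType::ACL_DST, &dst}};
    stage.run(pack);
}
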
diff --git a/src/gpu/cl/operators/ClIndirectConv2d.cpp b/src/gpu/cl/operators/ClIndirectConv2d.cpp
new file mode 100644
index 0000000000..777fc9e5e1
--- /dev/null
+++ b/src/gpu/cl/operators/ClIndirectConv2d.cpp
@@ -0,0 +1,159 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClIndirectConv2d.h"
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h"
+#include "src/gpu/cl/kernels/ClIndirectConv2dKernel.h"
+#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
+#include "src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h"
+#include "src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h"
+
+using namespace arm_compute::cl_indirect_conv;
+
+namespace arm_compute
+{
+namespace opencl
+{
+using namespace arm_compute::experimental;
+
+namespace
+{
+DirectConvComputeKernelInfo
+config_indirect_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info)
+{
+ // Get GPU target
+ GPUTarget gpu_target = CLScheduler::get().target();
+
+ std::unique_ptr<IClIndirectConvKernelConfig> t = ClIndirectConvKernelConfigurationFactory::create(gpu_target);
+
+ return t->configure(src, weights, conv_info);
+}
+
+} // namespace
+
+void ClIndirectConv2d::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info);
+
+ // Reuse the direct convolution descriptor
+ const DirectConvComputeKernelInfo desc = config_indirect_convolution_nhwc(src, weights, conv_info);
+
+ // Configure indirect convolution kernels
+ auto k0 = std::make_unique<kernels::ClIndirectConv2dAddressPrecalculationKernel>();
+ auto k1 = std::make_unique<kernels::ClIndirectConv2dKernel>();
+
+ k0->set_target(CLScheduler::get().target());
+ k1->set_target(CLScheduler::get().target());
+
+ k0->configure(compile_context, src, weights, &_indirect_buffer, conv_info, desc);
+ k1->configure(compile_context, src, weights, biases, &_indirect_buffer, dst, conv_info, act_info, desc);
+
+ _addr_precalculation_kernel = std::move(k0);
+ _indirect_conv_kernel = std::move(k1);
+ _is_prepared = false;
+
+ // Tune kernels
+ CLScheduler::get().tune_kernel_static(*_indirect_conv_kernel);
+
+ // Request memory for the indirect buffer
+ _aux_mem[IndirectBuffer] =
+ MemoryInfo(offset_int_vec(IndirectBuffer), MemoryLifetime::Persistent, _indirect_buffer.total_size());
+}
+
+Status ClIndirectConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info)
+{
+ // Initialize the direct convolution descriptor
+ const DirectConvComputeKernelInfo desc = config_indirect_convolution_nhwc(src, weights, conv_info);
+
+ TensorShape ind_buffer_shape = misc::shape_calculator::compute_indirect_buffer_shape(
+ src->tensor_shape(), src->data_layout(), weights->tensor_shape(), conv_info, desc);
+
+ TensorInfo indirect_buffer(ind_buffer_shape, 1, DataType::S32);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dAddressPrecalculationKernel::validate(
+ src, weights, &indirect_buffer, conv_info, desc));
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dKernel::validate(src, weights, biases, &indirect_buffer, dst,
+ conv_info, act_info, desc));
+
+ return Status{};
+}
+
+void ClIndirectConv2d::run(ITensorPack &tensors)
+{
+ CLAuxTensorHandler indirect_buffer(offset_int_vec(IndirectBuffer), _indirect_buffer, tensors, true);
+
+ prepare(tensors);
+
+ ITensorPack indirect_conv2d_pack(tensors);
+ indirect_conv2d_pack.add_const_tensor(ACL_SRC_3, indirect_buffer.get());
+
+ // Run indirect convolution
+ CLScheduler::get().enqueue_op(*_indirect_conv_kernel, indirect_conv2d_pack, true);
+}
+
+void ClIndirectConv2d::prepare(ITensorPack &constants)
+{
+ if (!_is_prepared)
+ {
+ ICLTensor *indirect_buffer_aux =
+ utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(IndirectBuffer)));
+ ARM_COMPUTE_ERROR_ON(indirect_buffer_aux == nullptr);
+
+ ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Preparing indirect buffer");
+
+ CLAuxTensorHandler indirect_buffer(_indirect_buffer, *indirect_buffer_aux);
+ ARM_COMPUTE_ERROR_ON(indirect_buffer.get()->cl_buffer().get() == nullptr);
+
+ ITensorPack indirect_buffer_pack{{ACL_DST, indirect_buffer.get()}};
+ CLScheduler::get().enqueue_op(*_addr_precalculation_kernel, indirect_buffer_pack, true);
+
+ _is_prepared = true;
+ }
+}
+
+experimental::MemoryRequirements ClIndirectConv2d::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace opencl
+} // namespace arm_compute
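
A usage sketch for the operator above, assuming NHWC F32 tensors, the usual ACL_SRC_0/ACL_SRC_1/ACL_SRC_2/ACL_DST slots for src/weights/biases/dst, and the same slot-based workspace allocation as in the GEMMLowp sketch earlier. The indirect buffer is requested with MemoryLifetime::Persistent, so the same auxiliary tensor has to back both prepare() and every subsequent run():

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/gpu/cl/operators/ClIndirectConv2d.h"

#include <memory>
#include <vector>

using namespace arm_compute;

void indirect_conv2d_example()
{
    CLScheduler::get().default_init();

    // Assumed NHWC shapes: src [C, W, H, N], weights [C, Kx, Ky, OFM], dst [OFM, W, H, N].
    TensorInfo src_info(TensorShape(16U, 32U, 32U, 1U), 1, DataType::F32);
    TensorInfo wei_info(TensorShape(16U, 3U, 3U, 8U), 1, DataType::F32);
    TensorInfo bia_info(TensorShape(8U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(8U, 32U, 32U, 1U), 1, DataType::F32);
    src_info.set_data_layout(DataLayout::NHWC);
    wei_info.set_data_layout(DataLayout::NHWC);
    dst_info.set_data_layout(DataLayout::NHWC);

    const PadStrideInfo conv_info(1, 1, 1, 1); // stride 1, pad 1 -> same spatial size

    opencl::ClIndirectConv2d conv;
    conv.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &wei_info, &bia_info, &dst_info, conv_info);

    CLTensor src, weights, biases, dst;
    src.allocator()->init(src_info);
    weights.allocator()->init(wei_info);
    biases.allocator()->init(bia_info);
    dst.allocator()->init(dst_info);
    src.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack pack{{TensorType::ACL_SRC_0, &src},
                     {TensorType::ACL_SRC_1, &weights},
                     {TensorType::ACL_SRC_2, &biases},
                     {TensorType::ACL_DST, &dst}};

    // The persistent indirect buffer advertised by workspace() must stay alive across prepare() and run().
    std::vector<std::unique_ptr<CLTensor>> aux;
    for (const auto &req : conv.workspace())
    {
        aux.emplace_back(std::make_unique<CLTensor>());
        aux.back()->allocator()->init(TensorInfo(TensorShape(req.size), 1, DataType::U8));
        aux.back()->allocator()->allocate();
        pack.add_tensor(req.slot, aux.back().get());
    }

    conv.run(pack); // run() calls prepare() internally on first use
}
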
diff --git a/src/gpu/cl/operators/ClIndirectConv2d.h b/src/gpu/cl/operators/ClIndirectConv2d.h
new file mode 100644
index 0000000000..29e796efd9
--- /dev/null
+++ b/src/gpu/cl/operators/ClIndirectConv2d.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_INDIRECT_CONV2D_H
+#define ARM_COMPUTE_CL_INDIRECT_CONV2D_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTypes.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/IClOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+// Forward declaration
+struct DirectConvComputeKernelInfo;
+
+namespace opencl
+{
+/** Basic function to execute indirect convolution on OpenCL. This function calls the following OpenCL kernels:
+ *
+ * -# @ref kernels::ClIndirectConv2dAddressPrecalculationKernel
+ * -# @ref kernels::ClIndirectConv2dKernel
+ */
+class ClIndirectConv2d : public IClOperator
+{
+public:
+ ClIndirectConv2d() = default;
+ /** Initialise the kernel's inputs and output
+ *
+ * Valid data layouts:
+ * - NHWC
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:------------|:-----------|:---------|:--------------|
+ * |F32 |F32 |F32 |F32 |
+ * |F16 |F16 |F16 |F16 |
+ *
+ * @note All tensors must have the same data type.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor. 3 lower dimensions represent a single src,
+ * while every optional dimension from 4 and above represent a batch of sources.
+ * Data types supported: F16/F32.
+ * @param[in] weights Weights tensor. The weights are a 4D tensor. Data type supported: Same as @p src.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p src data type.
+ * @param[out] dst Destination tensor. 3 lower dimensions represent a single dst, while the rest represent batch of destinations.
+ * Data types supported: Same as @p src.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ *
+ */
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClIndirectConv2d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ enum AuxTensorIdx
+ {
+ IndirectBuffer = 0,
+ Count
+ };
+
+ std::unique_ptr<IClKernel> _indirect_conv_kernel{nullptr};
+ std::unique_ptr<IClKernel> _addr_precalculation_kernel{nullptr};
+ TensorInfo _indirect_buffer{};
+ bool _is_prepared{false};
+ experimental::MemoryRequirements _aux_mem{Count};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_INDIRECT_CONV2D_H */
diff --git a/src/gpu/cl/operators/ClLogicalNot.cpp b/src/gpu/cl/operators/ClLogicalNot.cpp
new file mode 100644
index 0000000000..d8d4186d00
--- /dev/null
+++ b/src/gpu/cl/operators/ClLogicalNot.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClLogicalNot.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClLogicalNot::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst);
+ auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>();
+ k->configure(compile_context, src, dst, ElementWiseUnary::LOGICAL_NOT);
+ _kernel = std::move(k);
+}
+
+Status ClLogicalNot::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::LOGICAL_NOT);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClLogicalNot.h b/src/gpu/cl/operators/ClLogicalNot.h
new file mode 100644
index 0000000000..31d4a99be6
--- /dev/null
+++ b/src/gpu/cl/operators/ClLogicalNot.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_LOGICAL_NOT_H
+#define ARM_COMPUTE_CL_LOGICAL_NOT_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClElementWiseUnaryKernel for NOT operation */
+class ClLogicalNot : public IClOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: U8.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src.
+ */
+ void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClLogicalNot::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_LOGICAL_NOT_H */
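
A short usage sketch, assuming U8 tensors holding boolean values and the ACL_SRC/ACL_DST slots read by the element-wise unary kernel:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/gpu/cl/operators/ClLogicalNot.h"

using namespace arm_compute;

void logical_not_example()
{
    CLScheduler::get().default_init();

    TensorInfo src_info(TensorShape(256U), 1, DataType::U8);
    TensorInfo dst_info(TensorShape(256U), 1, DataType::U8);

    opencl::ClLogicalNot op;
    op.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info);

    CLTensor src, dst;
    src.allocator()->init(src_info);
    dst.allocator()->init(dst_info);
    src.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack pack{{TensorType::ACL_SRC, &src}, {TensorType::ACL_DST, &dst}};
    op.run(pack);
}
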
diff --git a/src/gpu/cl/operators/ClMatMul.cpp b/src/gpu/cl/operators/ClMatMul.cpp
new file mode 100644
index 0000000000..28a2aa2540
--- /dev/null
+++ b/src/gpu/cl/operators/ClMatMul.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClMatMul.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h"
+#include "src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h"
+#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
+#include "src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h"
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h"
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h"
+#include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelVariant.h"
+#include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h"
+
+using namespace arm_compute::cl_matmul;
+
+namespace arm_compute
+{
+namespace opencl
+{
+using namespace arm_compute::opencl::kernels;
+
+ClMatMul::ClMatMul()
+{
+}
+
+Status ClMatMul::validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ const MatMulInfo &matmul_info,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+ DataType::F16, DataType::F32);
+
+ const GPUTarget gpu_target = CLScheduler::get().target();
+
+ std::unique_ptr<IClMatMulNativeKernelConfig> t = ClMatMulNativeKernelConfigurationFactory::create(gpu_target);
+
+ const MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info);
+
+ const auto kernel_selector = ClMatMulNativeKernelVariantFactory::create(gpu_target);
+ const MatMulKernelType kernel_type = kernel_selector->select_kernel(lhs, rhs, matmul_info, act_info);
+
+ switch (kernel_type)
+ {
+ case MatMulKernelType::NATIVE_FP:
+ return ClMatMulNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+ case MatMulKernelType::NATIVE_MMUL_FP:
+ return ClMatMulNativeMMULKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info);
+ case MatMulKernelType::NATIVE_QUANTIZED:
+ return ClMatMulLowpNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+ case MatMulKernelType::NATIVE_MMUL_QUANTIZED:
+ return ClMatMulLowpNativeMMULKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+ default:
+ ARM_COMPUTE_ERROR("Unsupported MatMul Kernel!");
+ }
+}
+
+void ClMatMul::configure(const CLCompileContext &compile_context,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *dst,
+ const MatMulInfo &matmul_info,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
+ ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_info);
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, dst, matmul_info));
+
+ const GPUTarget gpu_target = CLScheduler::get().target();
+ const auto kernel_config = ClMatMulNativeKernelConfigurationFactory::create(gpu_target);
+ const MatMulKernelInfo kernel_info = kernel_config->configure(lhs, rhs, matmul_info);
+
+ const auto kernel_selector = ClMatMulNativeKernelVariantFactory::create(gpu_target);
+ const MatMulKernelType kernel_type = kernel_selector->select_kernel(lhs, rhs, matmul_info, act_info);
+
+ switch (kernel_type)
+ {
+ case MatMulKernelType::NATIVE_FP:
+ {
+ auto kernel = std::make_unique<ClMatMulNativeKernel>();
+ kernel->set_target(gpu_target);
+
+ kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+ _matmul_kernel = std::move(kernel);
+ }
+ break;
+ case MatMulKernelType::NATIVE_MMUL_FP:
+ {
+ auto kernel = std::make_unique<ClMatMulNativeMMULKernel>();
+ kernel->set_target(gpu_target);
+
+ kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info);
+ _matmul_kernel = std::move(kernel);
+ }
+ break;
+ case MatMulKernelType::NATIVE_QUANTIZED:
+ {
+ auto kernel = std::make_unique<ClMatMulLowpNativeKernel>();
+ kernel->set_target(gpu_target);
+
+ kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+ _matmul_kernel = std::move(kernel);
+ }
+ break;
+ case MatMulKernelType::NATIVE_MMUL_QUANTIZED:
+ {
+ auto kernel = std::make_unique<ClMatMulLowpNativeMMULKernel>();
+ kernel->set_target(gpu_target);
+
+ kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+ _matmul_kernel = std::move(kernel);
+ }
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Unsupported MatMul Kernel!");
+ }
+}
+
+void ClMatMul::run(ITensorPack &tensors)
+{
+ CLScheduler::get().enqueue_op(*_matmul_kernel, tensors, /* flush */ true);
+}
+
+} // namespace opencl
+} // namespace arm_compute
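
A usage sketch for ClMatMul, assuming F32 tensors with an LHS of shape [K, M, batch] and an RHS of shape [N, K, batch] (no adjoint flags set in MatMulInfo), and the ACL_SRC_0/ACL_SRC_1/ACL_DST pack slots:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/function_info/MatMulInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/gpu/cl/operators/ClMatMul.h"

using namespace arm_compute;

void matmul_example()
{
    CLScheduler::get().default_init();

    const unsigned int M = 64, N = 32, K = 128, batch = 4;
    TensorInfo lhs_info(TensorShape(K, M, batch), 1, DataType::F32);
    TensorInfo rhs_info(TensorShape(N, K, batch), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(N, M, batch), 1, DataType::F32);

    const MatMulInfo matmul_info{}; // adj_lhs = adj_rhs = false

    opencl::ClMatMul matmul;
    ARM_COMPUTE_ERROR_THROW_ON(opencl::ClMatMul::validate(&lhs_info, &rhs_info, &dst_info, matmul_info));
    matmul.configure(CLKernelLibrary::get().get_compile_context(), &lhs_info, &rhs_info, &dst_info, matmul_info);

    CLTensor lhs, rhs, dst;
    lhs.allocator()->init(lhs_info);
    rhs.allocator()->init(rhs_info);
    dst.allocator()->init(dst_info);
    lhs.allocator()->allocate();
    rhs.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack pack{{TensorType::ACL_SRC_0, &lhs}, {TensorType::ACL_SRC_1, &rhs}, {TensorType::ACL_DST, &dst}};
    matmul.run(pack);
}
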
diff --git a/src/gpu/cl/operators/ClMatMul.h b/src/gpu/cl/operators/ClMatMul.h
new file mode 100644
index 0000000000..1733def21c
--- /dev/null
+++ b/src/gpu/cl/operators/ClMatMul.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_OPERATORS_CLMATMUL_H
+#define ACL_SRC_GPU_CL_OPERATORS_CLMATMUL_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "arm_compute/function_info/MatMulInfo.h"
+
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/IClOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic operator to execute BatchMatMul on OpenCL. This operator calls the following OpenCL kernels:
+ *
+ * -# @ref kernels::ClMatMulNativeKernel
+ */
+class ClMatMul : public IClOperator
+{
+public:
+ /** Constructor */
+ ClMatMul();
+ /** Default destructor */
+ ~ClMatMul() = default;
+ /** Initialise the kernel's inputs and output
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |lhs |rhs |dst |
+ * |:--------------|:--------------|:--------------|
+ * |F32 |F32 |F32 |
+ * |F16 |F16 |F16 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |QASYMM8_SIGNED |
+ * |QASYMM8 |QASYMM8 |QASYMM8 |
+ *
+ * @note BatchMatMul: Batched Matrix Multiply - [A * B], Multiplies all slices (slice is an element of a batch) of Tensors A and B
+ * and stores the result in the dst tensor of the same batch size.
+ * Here, 'batch' is the number of slices of A and B multiplied at a time; it is not to be confused with the batch dimension 'N' of NHWC/NCHW.
+ * For NHWC, for example, the batch is H * N, and in general it is H multiplied by all higher dimensions.
+ * @note All tensors must have the same data type.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] lhs Left-hand side tensor info. Data types supported: F16/F32/QASYMM8_SIGNED/QASYMM8.
+ * @param[in] rhs Right-hand side tensor info. Data types supported: same as @p lhs.
+ * @param[out] dst Output tensor to store the result of the batched matrix multiplication. Data types supported: same as @p lhs.
+ * @param[in] matmul_info Contains MatMul operation information described in @ref MatMulInfo.
+ * @param[in] act_info Class containing information about fused activation function.
+ */
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *lhs,
+ ITensorInfo *rhs,
+ ITensorInfo *dst,
+ const MatMulInfo &matmul_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClMatMul::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *lhs,
+ const ITensorInfo *rhs,
+ const ITensorInfo *dst,
+ const MatMulInfo &matmul_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+
+private:
+ std::unique_ptr<opencl::IClKernel> _matmul_kernel{nullptr};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_OPERATORS_CLMATMUL_H
diff --git a/src/gpu/cl/operators/ClMul.cpp b/src/gpu/cl/operators/ClMul.cpp
new file mode 100644
index 0000000000..10cf8a6a38
--- /dev/null
+++ b/src/gpu/cl/operators/ClMul.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClMul.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClMulKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClMul::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info);
+ auto k = std::make_unique<kernels::ClMulKernel>();
+ k->configure(compile_context, src1, src2, dst, scale, overflow_policy, rounding_policy, act_info);
+ _kernel = std::move(k);
+}
+
+Status ClMul::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info)
+{
+ return kernels::ClMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info);
+}
+
+void ClComplexMul::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
+{
+ auto k = std::make_unique<kernels::ClComplexMulKernel>();
+ k->configure(compile_context, src1, src2, dst, act_info);
+ _kernel = std::move(k);
+}
+
+Status ClComplexMul::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info)
+{
+ return kernels::ClComplexMulKernel::validate(src1, src2, dst, act_info);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClMul.h b/src/gpu/cl/operators/ClMul.h
new file mode 100644
index 0000000000..1cf4d68d4c
--- /dev/null
+++ b/src/gpu/cl/operators/ClMul.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_MUL_H
+#define ARM_COMPUTE_CL_MUL_H
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref opencl::kernels::ClMulKernel */
+class ClMul : public IClOperator
+{
+public:
+ /** Initialise the kernel's sources, dst and conversion policy.
+ *
+ * Valid configurations (src1,src2) -> Output :
+ *
+ * - (U8,U8) -> U8
+ * - (U8,U8) -> S16
+ * - (U8,S16) -> S16
+ * - (S16,U8) -> S16
+ * - (S16,S16) -> S16
+ * - (F16,F16) -> F16
+ * - (F32,F32) -> F32
+ * - (QASYMM8,QASYMM8) -> QASYMM8
+ * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+ * - (QSYMM16,QSYMM16) -> QSYMM16
+ * - (QSYMM16,QSYMM16) -> S32
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in, out] src1 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+ * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] src2 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+ * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[out] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+ * @param[in] scale Scale to apply after multiplication.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+ * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClMul::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ float scale,
+ ConvertPolicy overflow_policy,
+ RoundingPolicy rounding_policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+
+/** Basic function to run @ref opencl::kernels::ClComplexMulKernel */
+class ClComplexMul : public IClOperator
+{
+public:
+ /** Initialise the kernel's sources, dst.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in, out] src1 An src tensor info. Data types supported: F16/F32. Number of channels supported: 2.
+ * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] src2 An src tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
+ * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[out] dst The dst tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const CLCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClComplexMul::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_MUL_H */
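
A usage sketch for ClMul with F32 tensors. The scale of 1/255 is one of the values allowed by the constraint documented above (1/255 or 1/2^n); the convert and rounding policies are illustrative picks from the supported sets:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/gpu/cl/operators/ClMul.h"

using namespace arm_compute;

void mul_example()
{
    CLScheduler::get().default_init();

    TensorInfo src1_info(TensorShape(128U, 128U), 1, DataType::F32);
    TensorInfo src2_info(TensorShape(128U, 128U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(128U, 128U), 1, DataType::F32);

    opencl::ClMul mul;
    mul.configure(CLKernelLibrary::get().get_compile_context(), &src1_info, &src2_info, &dst_info,
                  1.f / 255.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

    CLTensor src1, src2, dst;
    src1.allocator()->init(src1_info);
    src2.allocator()->init(src2_info);
    dst.allocator()->init(dst_info);
    src1.allocator()->allocate();
    src2.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack pack{{TensorType::ACL_SRC_0, &src1}, {TensorType::ACL_SRC_1, &src2}, {TensorType::ACL_DST, &dst}};
    mul.run(pack);
}
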
diff --git a/src/gpu/cl/operators/ClPRelu.cpp b/src/gpu/cl/operators/ClPRelu.cpp
new file mode 100644
index 0000000000..f3efd00bba
--- /dev/null
+++ b/src/gpu/cl/operators/ClPRelu.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClPRelu.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+using KernelType = kernels::ClArithmeticKernel;
+void ClPRelu::configure(const CLCompileContext &compile_context,
+ ITensorInfo *input,
+ ITensorInfo *alpha,
+ ITensorInfo *output)
+{
+ ARM_COMPUTE_LOG_PARAMS(input, alpha, output);
+ auto k = std::make_unique<KernelType>();
+ k->configure(compile_context, ArithmeticOperation::PRELU, input, alpha, (output == nullptr ? input : output));
+ _kernel = std::move(k);
+}
+
+Status ClPRelu::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
+{
+ return KernelType::validate(ArithmeticOperation::PRELU, input, alpha, (output == nullptr ? input : output));
+}
+
+void ClPRelu::run(ITensorPack &tensors)
+{
+ // Output tensor can be given as nullptr for in-place computation.
+ // In this case, get the input tensor and use it as the output tensor.
+ if (tensors.get_tensor(TensorType::ACL_DST) == nullptr)
+ {
+ auto src_tensor = const_cast<ITensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ ARM_COMPUTE_ERROR_ON_MSG(src_tensor == nullptr, "invalid source tensor is given for in-place computation");
+ tensors.add_tensor(TensorType::ACL_DST, src_tensor);
+ }
+ IClOperator::run(tensors);
+}
+} // namespace opencl
+} // namespace arm_compute
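
A sketch of the in-place path handled by ClPRelu::run() above: when the pack carries no ACL_DST entry, the result is written back into the ACL_SRC_0 tensor. The per-channel alpha shape and the slot ids are assumptions for illustration:

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/gpu/cl/operators/ClPRelu.h"

using namespace arm_compute;

void prelu_inplace_example()
{
    CLScheduler::get().default_init();

    TensorInfo input_info(TensorShape(64U, 64U, 16U), 1, DataType::F32);
    TensorInfo alpha_info(TensorShape(1U, 1U, 16U), 1, DataType::F32); // one slope per channel, broadcast over x/y

    opencl::ClPRelu prelu;
    // Passing nullptr as the output selects the in-place path.
    prelu.configure(CLKernelLibrary::get().get_compile_context(), &input_info, &alpha_info, nullptr);

    CLTensor input, alpha;
    input.allocator()->init(input_info);
    alpha.allocator()->init(alpha_info);
    input.allocator()->allocate();
    alpha.allocator()->allocate();

    // No ACL_DST entry: run() adds the source tensor as the destination.
    ITensorPack pack{{TensorType::ACL_SRC_0, &input}, {TensorType::ACL_SRC_1, &alpha}};
    prelu.run(pack);
}
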
diff --git a/src/gpu/cl/operators/ClPRelu.h b/src/gpu/cl/operators/ClPRelu.h
new file mode 100644
index 0000000000..45ce858fb0
--- /dev/null
+++ b/src/gpu/cl/operators/ClPRelu.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_PRELU_H
+#define ARM_COMPUTE_CL_PRELU_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic operator to run @ref arm_compute::opencl::kernels::ClArithmeticKernel for PRELU
+ *
+ * @note The operator implements an activation layer with the PRELU activation function.
+ */
+class ClPRelu : public IClOperator
+{
+public:
+ /** Set the input and output tensor.
+ *
+ * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[in] alpha PRelu layer parameters. Data types supported: same as @p input.
+ * @param[out] output Destination tensor. Data type supported: same as @p input
+ */
+ void
+ configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClPRelu::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_PRELU_H */
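For reference, a minimal usage sketch of the in-place path handled by ClPRelu::run() above. It is illustrative only and assumes a build that exposes the internal src/gpu/cl headers and a CL context initialised through CLScheduler::get().default_init(); none of the helper names below come from this patch.

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/gpu/cl/operators/ClPRelu.h"

using namespace arm_compute;

void prelu_in_place_sketch()
{
    CLScheduler::get().default_init(); // create a CL context and queue

    TensorInfo src_info(TensorShape(16U, 16U, 8U), 1, DataType::F32);
    TensorInfo alpha_info(TensorShape(16U, 16U, 8U), 1, DataType::F32);

    opencl::ClPRelu op;
    // Passing a null output requests in-place computation on the input.
    op.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &alpha_info, nullptr);

    CLTensor src, alpha;
    src.allocator()->init(src_info);
    alpha.allocator()->init(alpha_info);
    src.allocator()->allocate();
    alpha.allocator()->allocate();

    // No ACL_DST entry: run() re-adds the source tensor as the destination.
    ITensorPack pack{{TensorType::ACL_SRC_0, &src}, {TensorType::ACL_SRC_1, &alpha}};
    op.run(pack);
}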
diff --git a/src/gpu/cl/operators/ClPermute.cpp b/src/gpu/cl/operators/ClPermute.cpp
new file mode 100644
index 0000000000..3851e22b6a
--- /dev/null
+++ b/src/gpu/cl/operators/ClPermute.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClPermute.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClPermuteKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClPermute::configure(const ClCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ const PermutationVector &perm)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst, perm);
+ auto k = std::make_unique<kernels::ClPermuteKernel>();
+ k->configure(compile_context, src, dst, perm);
+ _kernel = std::move(k);
+}
+
+Status ClPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm)
+{
+ return kernels::ClPermuteKernel::validate(src, dst, perm);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClPermute.h b/src/gpu/cl/operators/ClPermute.h
new file mode 100644
index 0000000000..6349358a18
--- /dev/null
+++ b/src/gpu/cl/operators/ClPermute.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_PERMUTE_H
+#define ARM_COMPUTE_CL_PERMUTE_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClPermuteKernel */
+class ClPermute : public IClOperator
+{
+public:
+ /** Initialise the kernel's inputs, outputs and permutation vector
+ *
+ * @note Arbitrary permutation vectors are supported with rank not greater than 4
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src The src tensor info. Data types supported: All.
+ * @param[out] dst The dst tensor info. Data types supported: Same as @p src
+ * @param[in] perm Permutation vector
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ const PermutationVector &perm);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClPermute::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_PERMUTE_H */
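A hedged configuration sketch for the wrapper above. It assumes the underlying kernel auto-initialises an empty destination info with the permuted shape; the shapes and permutation vector are illustrative.

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "src/gpu/cl/operators/ClPermute.h"

using namespace arm_compute;

void permute_sketch()
{
    CLScheduler::get().default_init();

    TensorInfo src_info(TensorShape(16U, 16U, 8U), 1, DataType::F32);
    TensorInfo dst_info{}; // left empty: expected to be auto-initialised with the permuted shape

    const PermutationVector perm(2U, 0U, 1U); // rotate the first three dimensions

    if (bool(opencl::ClPermute::validate(&src_info, &dst_info, perm)))
    {
        opencl::ClPermute op;
        op.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info, perm);
    }
}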
diff --git a/src/gpu/cl/operators/ClPool2d.cpp b/src/gpu/cl/operators/ClPool2d.cpp
new file mode 100644
index 0000000000..e4507dc1a1
--- /dev/null
+++ b/src/gpu/cl/operators/ClPool2d.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClPool2d.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClPool2dKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClPool2d::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &info,
+ ITensorInfo *indices)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_LOG_PARAMS(src, dst, info, indices);
+
+ // Configure pooling kernel
+ auto k = std::make_unique<kernels::ClPool2dKernel>();
+ k->set_target(CLScheduler::get().target());
+ k->configure(compile_context, src, dst, info, indices);
+ _kernel = std::move(k);
+
+ // Tune kernels
+ CLScheduler::get().tune_kernel_static(*_kernel);
+}
+
+Status ClPool2d::validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &info,
+ const ITensorInfo *indices)
+{
+ return kernels::ClPool2dKernel::validate(src, dst, info, indices);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClPool2d.h b/src/gpu/cl/operators/ClPool2d.h
new file mode 100644
index 0000000000..9c2fd1c3f2
--- /dev/null
+++ b/src/gpu/cl/operators/ClPool2d.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_POOL2D_H
+#define ARM_COMPUTE_CL_POOL2D_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following OpenCL kernels:
+ *
+ * -# @ref opencl::kernels::ClPool2dKernel
+ */
+class ClPool2d : public IClOperator
+{
+public:
+ /** Constructor */
+ ClPool2d() = default;
+ /** Configure operator for a given list of arguments
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[out] dst Destination tensor info. Data type supported: same as @p src
+ * @param[in] info Pooling layer parameters.
+ * @param[out] indices (optional) The indices info of the maximal values. Data type supported: U32.
+ */
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const PoolingLayerInfo &info,
+ ITensorInfo *indices = nullptr);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClPool2d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *dst,
+ const PoolingLayerInfo &info,
+ const ITensorInfo *indices = nullptr);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_POOL2D_H */
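A configuration sketch for the operator above. The 2x2, stride-2 MAX pooling over an F32 NHWC input is assumed to be a configuration for which the indices output is supported, which is why the call is guarded by validate(); shapes are illustrative.

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "src/gpu/cl/operators/ClPool2d.h"

using namespace arm_compute;

void pool2d_sketch()
{
    CLScheduler::get().default_init();

    // NHWC source: 8 channels, 32x32 plane. A 2x2/stride-2 pool halves the spatial dimensions.
    TensorInfo src_info(TensorShape(8U, 32U, 32U), 1, DataType::F32);
    src_info.set_data_layout(DataLayout::NHWC);
    TensorInfo dst_info(TensorShape(8U, 16U, 16U), 1, DataType::F32);
    dst_info.set_data_layout(DataLayout::NHWC);
    TensorInfo indices_info(TensorShape(8U, 16U, 16U), 1, DataType::U32); // indices of the maxima
    indices_info.set_data_layout(DataLayout::NHWC);

    const PoolingLayerInfo pool_info(PoolingType::MAX, 2, DataLayout::NHWC, PadStrideInfo(2, 2, 0, 0));

    if (bool(opencl::ClPool2d::validate(&src_info, &dst_info, pool_info, &indices_info)))
    {
        opencl::ClPool2d op;
        op.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info, pool_info, &indices_info);
    }
}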
diff --git a/src/gpu/cl/operators/ClPool3d.cpp b/src/gpu/cl/operators/ClPool3d.cpp
new file mode 100644
index 0000000000..d230413659
--- /dev/null
+++ b/src/gpu/cl/operators/ClPool3d.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClPool3d.h"
+
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClPool3dKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClPool3d::configure(const ClCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ const Pooling3dLayerInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_LOG_PARAMS(src, dst, info);
+
+ // Configure pooling kernel
+ auto k = std::make_unique<kernels::ClPool3dKernel>();
+ k->set_target(CLScheduler::get().target());
+ k->configure(compile_context, src, dst, info);
+ _kernel = std::move(k);
+
+ // Tune kernels
+ CLScheduler::get().tune_kernel_static(*_kernel);
+}
+
+Status ClPool3d::validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &info)
+{
+ return kernels::ClPool3dKernel::validate(src, dst, info);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClPool3d.h b/src/gpu/cl/operators/ClPool3d.h
new file mode 100644
index 0000000000..9fd78bfd69
--- /dev/null
+++ b/src/gpu/cl/operators/ClPool3d.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_POOL3D_H
+#define ARM_COMPUTE_CL_POOL3D_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following OpenCL kernels:
+ *
+ * -# @ref opencl::kernels::ClPool3dKernel
+ */
+class ClPool3d : public IClOperator
+{
+public:
+ /** Constructor */
+ ClPool3d() = default;
+ /** Configure operator for a given list of arguments
+ *
+ * @note Asymmetric padding is not supported when dimension rounding type == CEIL.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info.
+ * @param[out] dst Destination tensor info.
+ * @param[in] info 3d Pooling layer parameters.
+ */
+ void configure(const ClCompileContext &compile_context,
+ const ITensorInfo *src,
+ ITensorInfo *dst,
+ const Pooling3dLayerInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClPool3d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &info);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_POOL3D_H */
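In the same spirit, a hedged sketch for the 3d variant. The Pooling3dLayerInfo member names and the NDHWC shapes used below are assumptions made for the example, not taken from this patch.

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "src/gpu/cl/operators/ClPool3d.h"

using namespace arm_compute;

void pool3d_sketch()
{
    CLScheduler::get().default_init();

    // NDHWC source: 8 channels, 16x16x16 volume, batch of 1. A 2x2x2/stride-2 pool halves each spatial dimension.
    TensorInfo src_info(TensorShape(8U, 16U, 16U, 16U, 1U), 1, DataType::F32);
    src_info.set_data_layout(DataLayout::NDHWC);
    TensorInfo dst_info(TensorShape(8U, 8U, 8U, 8U, 1U), 1, DataType::F32);
    dst_info.set_data_layout(DataLayout::NDHWC);

    Pooling3dLayerInfo pool_info{};
    pool_info.pool_type = PoolingType::AVG;
    pool_info.pool_size = Size3D(2U, 2U, 2U);
    pool_info.stride    = Size3D(2U, 2U, 2U);

    if (bool(opencl::ClPool3d::validate(&src_info, &dst_info, pool_info)))
    {
        opencl::ClPool3d op;
        op.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info, pool_info);
    }
}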
diff --git a/src/gpu/cl/operators/ClQuantize.cpp b/src/gpu/cl/operators/ClQuantize.cpp
new file mode 100644
index 0000000000..8560b5553e
--- /dev/null
+++ b/src/gpu/cl/operators/ClQuantize.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClQuantize.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClQuantizeKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClQuantize::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst);
+ auto k = std::make_unique<kernels::ClQuantizeKernel>();
+ k->configure(compile_context, src, dst);
+ _kernel = std::move(k);
+}
+
+Status ClQuantize::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClQuantizeKernel::validate(src, dst);
+}
+
+void ClQuantize::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+ CLScheduler::get().enqueue_op(*_kernel.get(), tensors);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClQuantize.h b/src/gpu/cl/operators/ClQuantize.h
new file mode 100644
index 0000000000..3e50fcefb3
--- /dev/null
+++ b/src/gpu/cl/operators/ClQuantize.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_QUANTIZE_H
+#define ARM_COMPUTE_CL_QUANTIZE_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClQuantizeKernel that quantizes an input tensor */
+class ClQuantize : public IClOperator
+{
+public:
+ /** Set the input and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+ * @param[out] dst Destination tensor with the same dimensions as the input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
+ *
+ * @note Output auto initialization is not supported by this function
+ */
+ void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClQuantize::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+ // Inherited method overridden
+ void run(ITensorPack &tensors) override;
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_QUANTIZE_H */
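A short sketch of the expected calling pattern, reflecting the note above that the destination info must be fully specified by the caller (including its quantization info). The ACL_SRC/ACL_DST slot wiring is an assumption based on the surrounding operators.

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/gpu/cl/operators/ClQuantize.h"

using namespace arm_compute;

void quantize_sketch()
{
    CLScheduler::get().default_init();

    TensorInfo src_info(TensorShape(64U, 8U), 1, DataType::F32);
    // No output auto-initialization: shape, data type and quantization info are set by the caller.
    TensorInfo dst_info(TensorShape(64U, 8U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10));

    opencl::ClQuantize op;
    op.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info);

    CLTensor src, dst;
    src.allocator()->init(src_info);
    dst.allocator()->init(dst_info);
    src.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack pack{{TensorType::ACL_SRC, &src}, {TensorType::ACL_DST, &dst}};
    op.run(pack);
}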
diff --git a/src/gpu/cl/operators/ClReshape.cpp b/src/gpu/cl/operators/ClReshape.cpp
new file mode 100644
index 0000000000..1dd5b760cb
--- /dev/null
+++ b/src/gpu/cl/operators/ClReshape.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClReshape.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClReshapeKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClReshape::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst);
+ auto k = std::make_unique<kernels::ClReshapeKernel>();
+ k->configure(compile_context, src, dst);
+ _kernel = std::move(k);
+}
+
+Status ClReshape::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClReshapeKernel::validate(src, dst);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClReshape.h b/src/gpu/cl/operators/ClReshape.h
new file mode 100644
index 0000000000..fee69a1c24
--- /dev/null
+++ b/src/gpu/cl/operators/ClReshape.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_RESHAPE_H
+#define ARM_COMPUTE_CL_RESHAPE_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClReshapeKernel */
+class ClReshape : public IClOperator
+{
+public:
+ /** Initialise the kernel's inputs and outputs
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Input tensor info. Data type supported: All
+ * @param[out] output Output info. Data type supported: Same as @p input
+ */
+ void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClReshape::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_RESHAPE_H */
\ No newline at end of file
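A validate-level sketch only, since the wrapper adds nothing beyond the kernel call; the illustrative shapes simply preserve the element count, which is what a reshape requires.

#include "arm_compute/core/TensorInfo.h"
#include "src/gpu/cl/operators/ClReshape.h"

using namespace arm_compute;

bool reshape_is_valid_sketch()
{
    // 8 * 4 * 2 = 64 elements in, 64 elements out.
    const TensorInfo src_info(TensorShape(8U, 4U, 2U), 1, DataType::F16);
    const TensorInfo dst_info(TensorShape(64U), 1, DataType::F16);
    return bool(opencl::ClReshape::validate(&src_info, &dst_info));
}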
diff --git a/src/gpu/cl/operators/ClScale.cpp b/src/gpu/cl/operators/ClScale.cpp
new file mode 100644
index 0000000000..184e2aa006
--- /dev/null
+++ b/src/gpu/cl/operators/ClScale.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClScale.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClScaleKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClScale::configure(const CLCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *dst,
+ const ScaleKernelInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src);
+ ARM_COMPUTE_LOG_PARAMS(src, dst, info);
+
+ // Configure Scale kernel
+ auto k = std::make_unique<kernels::ClScaleKernel>();
+ k->set_target(CLScheduler::get().target());
+ k->configure(compile_context, src, dst, info);
+ _kernel = std::move(k);
+
+ // Tune kernel
+ CLScheduler::get().tune_kernel_static(*_kernel);
+}
+
+Status ClScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info)
+{
+ return kernels::ClScaleKernel::validate(src, dst, info);
+}
+
+void ClScale::run(ITensorPack &tensors)
+{
+ ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+ CLScheduler::get().enqueue_op(*_kernel.get(), tensors);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClScale.h b/src/gpu/cl/operators/ClScale.h
new file mode 100644
index 0000000000..1427bb4fdc
--- /dev/null
+++ b/src/gpu/cl/operators/ClScale.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_SCALE_H
+#define ARM_COMPUTE_CL_SCALE_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to simulate a scale layer. This function calls the following OpenCL kernels:
+ *
+ * -# @ref kernels::ClScaleKernel
+ */
+class ClScale : public IClOperator
+{
+public:
+ /** Constructor */
+ ClScale() = default;
+ /** Initialize the function's source, destination and the @ref ScaleKernelInfo descriptor.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in,out] src Source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32. (Written to only when the border mode in @p info is not UNDEFINED)
+ * @param[out] dst Destination tensor info. Data types supported: Same as @p src
+ * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+ * @param[in] info @ref ScaleKernelInfo descriptor to be used to configure
+ */
+ void
+ configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClScale::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info);
+
+ // Inherited method overridden
+ void run(ITensorPack &tensors) override;
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_SCALE_H */
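An illustrative configuration sketch. The two-argument ScaleKernelInfo construction (interpolation policy plus border mode, remaining fields defaulted) and the 2x upscale shapes are assumptions made for the example.

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "src/gpu/cl/operators/ClScale.h"

using namespace arm_compute;

void scale_sketch()
{
    CLScheduler::get().default_init();

    // The scaling ratio is implied by the source and destination shapes (here a 2x upscale).
    TensorInfo src_info(TensorShape(32U, 32U, 3U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(64U, 64U, 3U), 1, DataType::F32);

    const ScaleKernelInfo info(InterpolationPolicy::BILINEAR, BorderMode::REPLICATE);

    if (bool(opencl::ClScale::validate(&src_info, &dst_info, info)))
    {
        opencl::ClScale op;
        op.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info, info);
    }
}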
diff --git a/src/gpu/cl/operators/ClScatter.cpp b/src/gpu/cl/operators/ClScatter.cpp
new file mode 100644
index 0000000000..a11ecd7e6a
--- /dev/null
+++ b/src/gpu/cl/operators/ClScatter.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClScatter.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClCopyKernel.h"
+#include "src/gpu/cl/kernels/ClFillKernel.h"
+#include "src/gpu/cl/kernels/ClScatterKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+using namespace arm_compute::opencl::kernels;
+
+ClScatter::ClScatter()
+{
+}
+
+Status ClScatter::validate(const ITensorInfo *src,
+ const ITensorInfo *updates,
+ const ITensorInfo *indices,
+ const ITensorInfo *dst,
+ const ScatterInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(updates, indices, dst);
+ if (src != nullptr)
+ {
+ // Check dst/src are same shape and datatype.
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst->tensor_shape());
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, updates, dst);
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClCopyKernel::validate(src, dst)); // Validate Copy kernel
+ }
+ if (src != dst)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClFillKernel::validate(dst, PixelValue(0.0f))); // Validate Fill kernel.
+ }
+
+ return kernels::ClScatterKernel::validate(updates, indices, dst, info);
+}
+
+void ClScatter::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ const ITensorInfo *updates,
+ const ITensorInfo *indices,
+ ITensorInfo *dst,
+ const ScatterInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(updates, indices, dst);
+ ARM_COMPUTE_LOG_PARAMS(src, indices, dst, info);
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate(src, updates, indices, dst, info));
+ _fill_zero = info.zero_initialization;
+
+ // If necessary, create fill kernel to fill dst tensor.
+ if (_fill_zero)
+ {
+ auto f = std::make_unique<kernels::ClFillKernel>();
+ f->configure(compile_context, dst, PixelValue(0.0f));
+ _fill_kernel = std::move(f);
+ }
+ else if (src != dst) // Check whether copying is necessary
+ {
+ // Fill dst with src copy here.
+ auto j = std::make_unique<kernels::ClCopyKernel>();
+ j->configure(compile_context, src, dst);
+ _copy_kernel = std::move(j);
+ _run_copy = true;
+ }
+
+ // Configure ClScatterKernel
+ auto k = std::make_unique<kernels::ClScatterKernel>();
+ k->set_target(CLScheduler::get().target());
+ k->configure(compile_context, updates, indices, dst, info);
+ _scatter_kernel = std::move(k);
+}
+
+void ClScatter::run(ITensorPack &tensors)
+{
+ // Get tensors.
+ auto src = tensors.get_const_tensor(ACL_SRC_0);
+ auto updates = tensors.get_const_tensor(ACL_SRC_1);
+ auto indices = tensors.get_const_tensor(ACL_SRC_2);
+ auto dst = tensors.get_tensor(ACL_DST);
+
+ if (_fill_zero)
+ {
+ // Fill destination tensor with 0 values if zero init.
+ ITensorPack fill_pack{{ACL_SRC, dst}};
+ CLScheduler::get().enqueue_op(*_fill_kernel, fill_pack, false);
+ }
+
+ if (_run_copy)
+ {
+ // copy src to dst before scatter op.
+ ITensorPack copy_pack{{ACL_SRC, src}, {ACL_DST, dst}};
+ CLScheduler::get().enqueue_op(*_copy_kernel, copy_pack, false);
+ }
+
+ ITensorPack scatter_pack{{ACL_SRC_0, updates}, {ACL_SRC_1, indices}, {ACL_DST, dst}};
+ CLScheduler::get().enqueue_op(*_scatter_kernel, scatter_pack, false);
+}
+
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClScatter.h b/src/gpu/cl/operators/ClScatter.h
new file mode 100644
index 0000000000..a1b32fed45
--- /dev/null
+++ b/src/gpu/cl/operators/ClScatter.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ACL_SRC_GPU_CL_OPERATORS_CLSCATTER_H
+#define ACL_SRC_GPU_CL_OPERATORS_CLSCATTER_H
+
+#include "arm_compute/function_info/ScatterInfo.h"
+
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/IClOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+// Forward declarations
+namespace kernels
+{
+class ClFillKernel;
+class ClScatterKernel;
+class ClCopyKernel;
+} // namespace kernels
+
+/** Basic operator to execute Scatter on OpenCL. This operator calls the following OpenCL kernels:
+ *
+ * -# @ref kernels::ClFillKernel (when zero initialization of the destination is requested)
+ * -# @ref kernels::ClCopyKernel (when the destination must first be initialized from @p src)
+ * -# @ref kernels::ClScatterKernel
+ */
+class ClScatter : public IClOperator
+{
+public:
+ /** Constructor */
+ ClScatter();
+ /** Default destructor */
+ ~ClScatter() = default;
+ /** Initialise the kernel's inputs and output
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * @note indices must always be S32.
+ * @note Negative indices are treated as out of bounds.
+ * @note src, updates and dst tensors must be same datatype.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source input tensor info. Can be nullptr when using "Add" Scatter Function with zero initialization.
+ * @param[in] updates Tensor info for tensor storing update values to use for scatter function. Data types supported: same as @p src.
+ * @param[in] indices Tensor info for tensor storing indices to use for scatter function. Data types supported: S32 only.
+ * @param[out] dst Output tensor to store the result of the Scatter Function. Data types supported: same as @p src and @p updates.
+ * @param[in] Scatter_info Contains Scatter operation information described in @ref ScatterInfo.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *src,
+ const ITensorInfo *updates,
+ const ITensorInfo *indices,
+ ITensorInfo *dst,
+ const ScatterInfo &Scatter_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClScatter::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *updates,
+ const ITensorInfo *indices,
+ const ITensorInfo *dst,
+ const ScatterInfo &Scatter_info);
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+
+private:
+ std::unique_ptr<opencl::IClKernel> _scatter_kernel{nullptr};
+ std::unique_ptr<opencl::IClKernel> _fill_kernel{nullptr};
+ std::unique_ptr<opencl::IClKernel> _copy_kernel{nullptr};
+ bool _fill_zero{false};
+ bool _run_copy{false};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_OPERATORS_CLSCATTER_H
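Since ClScatter::run() above fixes the tensor slots, a sketch of the expected pack wiring may help. The shapes (a 6x10 destination with two indexed updates) and the ScatterInfo construction are assumptions chosen for illustration, hence the validate() guard.

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/function_info/ScatterInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/gpu/cl/operators/ClScatter.h"

using namespace arm_compute;

void scatter_sketch()
{
    CLScheduler::get().default_init();

    TensorInfo src_info(TensorShape(6U, 10U), 1, DataType::F32);    // data to start from
    TensorInfo updates_info(TensorShape(6U, 2U), 1, DataType::F32); // two update vectors
    TensorInfo indices_info(TensorShape(1U, 2U), 1, DataType::S32); // indices must be S32
    TensorInfo dst_info(TensorShape(6U, 10U), 1, DataType::F32);

    const ScatterInfo info(ScatterFunction::Update, false /* zero_initialization */);

    if (!bool(opencl::ClScatter::validate(&src_info, &updates_info, &indices_info, &dst_info, info)))
    {
        return;
    }

    opencl::ClScatter op;
    op.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &updates_info, &indices_info, &dst_info,
                 info);

    CLTensor src, updates, indices, dst;
    src.allocator()->init(src_info);
    updates.allocator()->init(updates_info);
    indices.allocator()->init(indices_info);
    dst.allocator()->init(dst_info);
    for (CLTensor *t : {&src, &updates, &indices, &dst})
    {
        t->allocator()->allocate();
    }

    // Slot assignment mirrors ClScatter::run(): source, updates, indices, then the destination.
    ITensorPack pack{{TensorType::ACL_SRC_0, &src},
                     {TensorType::ACL_SRC_1, &updates},
                     {TensorType::ACL_SRC_2, &indices},
                     {TensorType::ACL_DST, &dst}};
    op.run(pack);
}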
diff --git a/src/gpu/cl/operators/ClSoftmax.cpp b/src/gpu/cl/operators/ClSoftmax.cpp
new file mode 100644
index 0000000000..427f6b4f92
--- /dev/null
+++ b/src/gpu/cl/operators/ClSoftmax.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClSoftmax.h"
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/gpu/cl/kernels/ClSoftmaxKernel.h"
+#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
+
+using namespace arm_compute::experimental;
+
+namespace arm_compute
+{
+namespace opencl
+{
+
+ClSoftmax::ClSoftmax() : _aux_mem(InternalTensorIdx::COUNT)
+{
+}
+
+void ClSoftmax::configure(const CLCompileContext &compile_context,
+ const ITensorInfo &src,
+ ITensorInfo &dst,
+ const SoftmaxKernelInfo &info)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst, info);
+
+ auto k = std::make_unique<kernels::ClSoftmaxKernel>();
+ k->configure(compile_context, src, dst, info);
+
+ _tmp_info = k->tmp_tensor_info();
+
+ _kernel = std::move(k);
+
+ _aux_mem[InternalTensorIdx::TMP] =
+ MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp_info.total_size());
+}
+
+Status ClSoftmax::validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info)
+{
+ return kernels::ClSoftmaxKernel::validate(src, dst, info);
+}
+
+void ClSoftmax::run(ITensorPack &tensors)
+{
+ CLAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp_info, tensors);
+
+ tensors.add_tensor(TensorType::ACL_INT_0, tmp.get());
+
+ CLScheduler::get().enqueue_op(*_kernel, tensors, false);
+}
+
+experimental::MemoryRequirements ClSoftmax::workspace() const
+{
+ return _aux_mem;
+}
+
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClSoftmax.h b/src/gpu/cl/operators/ClSoftmax.h
new file mode 100644
index 0000000000..232fcfebd1
--- /dev/null
+++ b/src/gpu/cl/operators/ClSoftmax.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_GPU_CL_OPERATORS_CLSOFTMAX_H
+#define ACL_SRC_GPU_CL_OPERATORS_CLSOFTMAX_H
+
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+class CLCompileContext;
+class ITensorInfo;
+class ITensorPack;
+struct SoftmaxKernelInfo;
+
+namespace opencl
+{
+namespace kernels
+{
+class ClSoftmaxKernel;
+} // namespace kernels
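+/** Basic operator to run @ref kernels::ClSoftmaxKernel */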
+class ClSoftmax : public IClOperator
+{
+public:
+ /** Constructor */
+ ClSoftmax();
+ /** Configure the operator
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src
+ * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo &src,
+ ITensorInfo &dst,
+ const SoftmaxKernelInfo &info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClSoftmax::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info);
+
+ void run(ITensorPack &tensors) override;
+
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ enum InternalTensorIdx
+ {
+ TMP = 0,
+ COUNT,
+ };
+
+ TensorInfo _tmp_info{};
+ experimental::MemoryRequirements _aux_mem;
+};
+
+} // namespace opencl
+} // namespace arm_compute
+#endif // ACL_SRC_GPU_CL_OPERATORS_CLSOFTMAX_H
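A sketch of how a caller might satisfy workspace() before run(), following the auxiliary-tensor pattern used elsewhere in the library. The MemoryInfo field names (slot, size), the SoftmaxKernelInfo fields and the ACL_SRC/ACL_DST slot wiring are assumptions made to keep the example concrete.

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/gpu/cl/operators/ClSoftmax.h"

#include <list>

using namespace arm_compute;

void softmax_sketch()
{
    CLScheduler::get().default_init();

    TensorInfo src_info(TensorShape(128U, 16U), 1, DataType::F32);
    TensorInfo dst_info(TensorShape(128U, 16U), 1, DataType::F32);

    SoftmaxKernelInfo info{};
    info.beta            = 1.0f;
    info.is_log          = false;
    info.input_data_type = DataType::F32;
    info.axis            = 0;

    opencl::ClSoftmax op;
    op.configure(CLKernelLibrary::get().get_compile_context(), src_info, dst_info, info);

    CLTensor src, dst;
    src.allocator()->init(src_info);
    dst.allocator()->init(dst_info);
    src.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack pack{{TensorType::ACL_SRC, &src}, {TensorType::ACL_DST, &dst}};

    // Back each workspace slot with a byte buffer of the requested size.
    std::list<CLTensor> workspace;
    for (const auto &req : op.workspace())
    {
        if (req.size == 0)
        {
            continue;
        }
        workspace.emplace_back();
        workspace.back().allocator()->init(TensorInfo(TensorShape(req.size), 1, DataType::U8));
        workspace.back().allocator()->allocate();
        pack.add_tensor(req.slot, &workspace.back());
    }

    op.run(pack);
}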
diff --git a/src/gpu/cl/operators/ClSub.cpp b/src/gpu/cl/operators/ClSub.cpp
new file mode 100644
index 0000000000..5c6d0c3184
--- /dev/null
+++ b/src/gpu/cl/operators/ClSub.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClSub.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClSub::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, policy, act_info);
+ auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>();
+ k->configure(compile_context, ArithmeticOperation::SUB, src1, src2, dst, policy, act_info);
+ _kernel = std::move(k);
+}
+
+Status ClSub::validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info)
+{
+ return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::SUB, src1, src2, dst, policy, act_info);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClSub.h b/src/gpu/cl/operators/ClSub.h
new file mode 100644
index 0000000000..6a97275b86
--- /dev/null
+++ b/src/gpu/cl/operators/ClSub.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_SUB_H
+#define ARM_COMPUTE_CL_SUB_H
+
+#include "arm_compute/function_info/ActivationLayerInfo.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run arithmetic subtraction
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+ * @note The function performs an arithmetic subtraction between two tensors.
+ */
+class ClSub : public IClOperator
+{
+public:
+ /** Configure function for a given list of arguments.
+ *
+ * Valid configurations (src1,src2) -> dst :
+ *
+ * - (U8,U8) -> U8
+ * - (U8,U8) -> S16
+ * - (S16,U8) -> S16
+ * - (U8,S16) -> S16
+ * - (S16,S16) -> S16
+ * - (S32,S32) -> S32
+ * - (F16,F16) -> F16
+ * - (F32,F32) -> F32
+ * - (QASYMM8,QASYMM8) -> QASYMM8
+ * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+ * - (QSYMM16,QSYMM16) -> QSYMM16
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in, out] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+ * The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[in, out] src2 Second source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+ * The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0.
+ * @param[out] dst Destination tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32.
+ * @param[in] policy Policy to use to handle overflow.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src1,
+ ITensorInfo *src2,
+ ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClSub::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1,
+ const ITensorInfo *src2,
+ const ITensorInfo *dst,
+ ConvertPolicy policy,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_SUB_H */
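For completeness, a minimal end-to-end sketch of the wrapper above with two F32 inputs and a saturating policy; the ACL_SRC_0/ACL_SRC_1/ACL_DST slot convention is assumed from the elementwise kernels.

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/experimental/Types.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/gpu/cl/operators/ClSub.h"

using namespace arm_compute;

void sub_sketch()
{
    CLScheduler::get().default_init();

    TensorInfo a_info(TensorShape(32U, 32U), 1, DataType::F32);
    TensorInfo b_info(TensorShape(32U, 32U), 1, DataType::F32);
    TensorInfo out_info(TensorShape(32U, 32U), 1, DataType::F32);

    opencl::ClSub op;
    op.configure(CLKernelLibrary::get().get_compile_context(), &a_info, &b_info, &out_info, ConvertPolicy::SATURATE);

    CLTensor a, b, out;
    a.allocator()->init(a_info);
    b.allocator()->init(b_info);
    out.allocator()->init(out_info);
    a.allocator()->allocate();
    b.allocator()->allocate();
    out.allocator()->allocate();

    ITensorPack pack{{TensorType::ACL_SRC_0, &a}, {TensorType::ACL_SRC_1, &b}, {TensorType::ACL_DST, &out}};
    op.run(pack);
}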
diff --git a/src/gpu/cl/operators/ClTranspose.cpp b/src/gpu/cl/operators/ClTranspose.cpp
new file mode 100644
index 0000000000..28da0d640a
--- /dev/null
+++ b/src/gpu/cl/operators/ClTranspose.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClTranspose.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClTransposeKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClTranspose::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ ARM_COMPUTE_LOG_PARAMS(src, dst);
+ auto k = std::make_unique<kernels::ClTransposeKernel>();
+ k->configure(compile_context, src, dst);
+ _kernel = std::move(k);
+}
+
+Status ClTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClTransposeKernel::validate(src, dst);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClTranspose.h b/src/gpu/cl/operators/ClTranspose.h
new file mode 100644
index 0000000000..3642fc23f9
--- /dev/null
+++ b/src/gpu/cl/operators/ClTranspose.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_TRANSPOSE_H
+#define ARM_COMPUTE_CL_TRANSPOSE_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClTransposeKernel */
+class ClTranspose : public IClOperator
+{
+public:
+ /** Initialise the kernel's inputs and outputs
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src The src tensor info. Data types supported: All.
+ * @param[out] dst The dst tensor info. Data types supported: Same as @p src
+ */
+ void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClTranspose::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_TRANSPOSE_H */
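And a matching validate-level sketch for the transpose wrapper; the explicit destination shape just swaps the first two dimensions of the source.

#include "arm_compute/core/TensorInfo.h"
#include "src/gpu/cl/operators/ClTranspose.h"

using namespace arm_compute;

bool transpose_is_valid_sketch()
{
    const TensorInfo src_info(TensorShape(8U, 4U), 1, DataType::F32);
    const TensorInfo dst_info(TensorShape(4U, 8U), 1, DataType::F32); // width and height swapped
    return bool(opencl::ClTranspose::validate(&src_info, &dst_info));
}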
diff --git a/src/gpu/cl/operators/ClTransposedConvolution.cpp b/src/gpu/cl/operators/ClTransposedConvolution.cpp
new file mode 100644
index 0000000000..cec438faeb
--- /dev/null
+++ b/src/gpu/cl/operators/ClTransposedConvolution.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClTransposedConvolution.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClTransposedConvolutionKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClTransposedConvolution::configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *output,
+ const PadStrideInfo &deconv_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+ ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, deconv_info);
+ auto kernel_object = std::make_unique<kernels::ClTransposedConvolutionKernel>();
+ kernel_object->set_target(CLScheduler::get().target());
+ kernel_object->configure(compile_context, input, weights, biases, output, deconv_info);
+ _transposed_conv_kernel = std::move(kernel_object);
+}
+
+Status ClTransposedConvolution::validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &deconv_info)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::ClTransposedConvolutionKernel::validate(input, weights, biases, output, deconv_info));
+ return Status{};
+}
+
+void ClTransposedConvolution::run(ITensorPack &tensors)
+{
+ CLScheduler::get().enqueue_op(*_transposed_conv_kernel.get(), tensors, false);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClTransposedConvolution.h b/src/gpu/cl/operators/ClTransposedConvolution.h
new file mode 100644
index 0000000000..660c4f85c1
--- /dev/null
+++ b/src/gpu/cl/operators/ClTransposedConvolution.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_H
+#define ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/IClOperator.h"
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run a transposed convolution (deconvolution) layer. This function calls the following OpenCL kernel:
+ *
+ * -# @ref kernels::ClTransposedConvolutionKernel
+ */
+class ClTransposedConvolution : public IClOperator
+{
+public:
+ /** Default constructor */
+ ClTransposedConvolution() = default;
+ /** Default Destructor */
+ ~ClTransposedConvolution() = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ ClTransposedConvolution(const ClTransposedConvolution &) = delete;
+ /** Default move constructor */
+ ClTransposedConvolution(ClTransposedConvolution &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ ClTransposedConvolution &operator=(const ClTransposedConvolution &) = delete;
+ /** Default move assignment operator */
+ ClTransposedConvolution &operator=(ClTransposedConvolution &&) = default;
+
+ /** Set the input, weights, biases and output tensors.
+ *
+ * @note Only the NHWC data layout is supported
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Input tensor info with dimensions [IFM, width, height, batch]
+ * Data types supported: F16/F32/QASYMM8/QASYMM8_SIGNED.
+ * @param[in] weights Weight tensor info with dimensions [IFM, width, height, OFM].
+ * Data type supported: Same as @p input
+ * @param[in] biases (Optional) Biases tensor info. Biases are 1D tensor with dimension [OFM].
+ * Data type supported: Should match @p input data type if floating point, otherwise S32.
+ * @param[out] output Output tensor info with dimensions [OFM, width, height, batch]
+ * The 1st dimension must be equal to the 4th dimension of the @p weights tensor.
+ * Data types supported: Same as @p input.
+ * @param[in] deconv_info Contains padding and stride information described in @ref PadStrideInfo.
+ *
+ */
+ void configure(const CLCompileContext &compile_context,
+ const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ ITensorInfo *output,
+ const PadStrideInfo &deconv_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClTransposedConvolution::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *output,
+ const PadStrideInfo &deconv_info);
+
+ // Inherited method overridden
+ void run(ITensorPack &tensors) override;
+
+private:
+ std::unique_ptr<IClKernel> _transposed_conv_kernel{nullptr};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_H */
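A hedged configuration sketch for the transposed convolution declared above (not part of the patch). The concrete shapes, the stride-2 2x2 kernel, and the helper name configure_deconv are illustrative; the destination shape ((8 - 1) * 2 + 2 = 16 per spatial dimension) is assumed to have been computed by the caller, and the tensor orderings follow the documented [IFM, W, H, N] / [IFM, W, H, OFM] / [OFM, W, H, N] conventions.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/gpu/cl/operators/ClTransposedConvolution.h"

using namespace arm_compute;

void configure_deconv(const CLCompileContext &compile_context)
{
    TensorInfo input(TensorShape(16U, 8U, 8U, 1U), 1, DataType::F32);    // [IFM, W, H, N]
    TensorInfo weights(TensorShape(16U, 2U, 2U, 32U), 1, DataType::F32); // [IFM, W, H, OFM]
    TensorInfo biases(TensorShape(32U), 1, DataType::F32);               // [OFM]
    TensorInfo output(TensorShape(32U, 16U, 16U, 1U), 1, DataType::F32); // [OFM, W, H, N]
    input.set_data_layout(DataLayout::NHWC);
    weights.set_data_layout(DataLayout::NHWC);
    output.set_data_layout(DataLayout::NHWC);

    const PadStrideInfo deconv_info(2, 2, 0, 0); // stride 2, no padding

    if (bool(opencl::ClTransposedConvolution::validate(&input, &weights, &biases, &output, deconv_info)))
    {
        opencl::ClTransposedConvolution deconv;
        deconv.configure(compile_context, &input, &weights, &biases, &output, deconv_info);
        // At run time the caller packs the concrete ICLTensor objects into an ITensorPack
        // (input/weights/biases/output) and calls deconv.run(pack).
    }
}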
diff --git a/src/gpu/cl/operators/ClWinogradConv2d.cpp b/src/gpu/cl/operators/ClWinogradConv2d.cpp
new file mode 100644
index 0000000000..8ec96b247e
--- /dev/null
+++ b/src/gpu/cl/operators/ClWinogradConv2d.cpp
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) 2018-2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClWinogradConv2d.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h"
+#include "src/gpu/cl/kernels/ClWinogradInputTransformKernel.h"
+#include "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h"
+#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
+#include "support/Cast.h"
+
+using namespace arm_compute::experimental;
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace
+{
+Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, DataLayout data_layout)
+{
+ Size2D output_tile = Size2D{};
+
+ const unsigned int kernel_max_dim = std::max(kernel_dims.width, kernel_dims.height);
+
+ // Check if both input spatial dimensions are at most 4 and the data layout is NCHW
+ const bool is_input_lt4_nchw =
+ (input_dims.width <= 4 && input_dims.height <= 4) && (data_layout == DataLayout::NCHW);
+
+ if (kernel_max_dim == 3U)
+ {
+ if (kernel_dims == Size2D(3U, 3U))
+ {
+ output_tile = is_input_lt4_nchw ? Size2D(2U, 2U) : Size2D(4U, 4U);
+ }
+ else if (kernel_dims == Size2D(3U, 1U))
+ {
+ output_tile = is_input_lt4_nchw ? Size2D(2U, 1U) : Size2D(4U, 1U);
+ }
+ else
+ {
+ output_tile = is_input_lt4_nchw ? Size2D(1U, 2U) : Size2D(1U, 4U);
+ }
+ }
+ else if (kernel_max_dim == 5U)
+ {
+ output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U, kernel_dims.height == 1 ? 1U : 4U);
+ }
+ else if (kernel_max_dim == 7U)
+ {
+ output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U, kernel_dims.height == 1 ? 1U : 2U);
+ }
+
+ return output_tile;
+}
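To make the mapping above concrete, a few illustrative cases, written as hypothetical checks as if they lived in the same translation unit (not part of the patch); the 56x56 and 3x3 input sizes are arbitrary examples.

#include <cassert>

// Expected output tiles from winograd_output_tile() for representative inputs
assert(winograd_output_tile(Size2D(56U, 56U), Size2D(3U, 3U), DataLayout::NHWC) == Size2D(4U, 4U)); // F(4x4, 3x3)
assert(winograd_output_tile(Size2D(3U, 3U), Size2D(3U, 3U), DataLayout::NCHW) == Size2D(2U, 2U));   // small NCHW input falls back to F(2x2, 3x3)
assert(winograd_output_tile(Size2D(56U, 56U), Size2D(5U, 5U), DataLayout::NHWC) == Size2D(4U, 4U)); // F(4x4, 5x5), fast-math only
assert(winograd_output_tile(Size2D(56U, 56U), Size2D(7U, 1U), DataLayout::NHWC) == Size2D(2U, 1U)); // F(2x1, 7x1)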
+
+bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size)
+{
+ // Check if we want to configure a Winograd configuration which requires fast math
+ using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
+
+ std::vector<WinogradConfiguration> fast_math_winograd = {
+ WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)),
+ WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7))};
+
+ auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
+ std::pair<int, int>(kernel_size.width, kernel_size.height));
+
+ return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end();
+}
+
+Status validate_arguments(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
+{
+ // Get indices for the width and height
+ const size_t idx_width = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
+
+ // Input shape, kernel size and output tile
+ const Size2D input_dims = Size2D(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height]);
+ const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
+ const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, src->data_layout());
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ ((conv_info.pad_left() > (kernel_size.x() / 2u)) || (conv_info.pad_right() > (kernel_size.x() / 2u))),
+ "Winograd only supports padding up to half kernel size");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+ ((conv_info.pad_top() > (kernel_size.y() / 2u)) || (conv_info.pad_bottom() > (kernel_size.y() / 2u))),
+ "Winograd only supports padding up to half kernel size");
+
+ // Check if the Winograd configuration requires fast math
+ if (!enable_fast_math)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ src, 1, DataType::F32); // Disable Winograd for FP16 when fast math is not enabled.
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size),
+ "This Winograd configuration requires enable_fast_math=true");
+ }
+
+ const WinogradInfo winograd_info =
+ WinogradInfo(output_tile, kernel_size, input_dims, conv_info, src->data_layout());
+
+ // Validate input transform
+ const TensorShape input0_shape =
+ misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info);
+ const TensorInfo input0 = src->clone()->set_tensor_shape(input0_shape);
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradInputTransformKernel::validate(src, &input0, winograd_info));
+
+ // Validate filter transform
+ const TensorShape input1_shape =
+ misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
+ const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape);
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradFilterTransformKernel::validate(weights, &input1, winograd_info));
+
+ // Validate batched matrix multiply
+ TensorShape batched_mm_output_shape = input0.tensor_shape();
+ batched_mm_output_shape[0] = input1.tensor_shape()[0];
+ const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape);
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ ClGemm::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f,
+ GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false,
+ GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16))));
+
+ // Configure output transform
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ kernels::ClWinogradOutputTransformKernel::validate(&batched_mm_output, biases, dst, winograd_info, act_info));
+ return Status{};
+}
+
+} // namespace
+
+ClWinogradConv2d::ClWinogradConv2d()
+ : _batched_mm(),
+ _input_transform(std::make_unique<kernels::ClWinogradInputTransformKernel>()),
+ _filter_transform(std::make_unique<kernels::ClWinogradFilterTransformKernel>()),
+ _output_transform(std::make_unique<kernels::ClWinogradOutputTransformKernel>()),
+ _border_handler(),
+ _input0(),
+ _input1(),
+ _batched_mm_output(),
+ _is_prepared(false),
+ _aux_mem()
+{
+}
+
+ClWinogradConv2d::~ClWinogradConv2d() = default;
+
+void ClWinogradConv2d::configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math));
+ ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info, enable_fast_math);
+
+ // Get indices for the width and height
+ const size_t idx_width = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
+ const size_t idx_height = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
+
+ // Input shape, kernel size and output tile
+ const Size2D input_dims = Size2D(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height]);
+ const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
+ const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, src->data_layout());
+
+ // Check if the Winograd configuration requires fast math
+ if (!enable_fast_math)
+ {
+ ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1,
+ DataType::F32); // Disable Winograd for FP16 when fast math is not enabled.
+ ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size),
+ "This Winograd configuration requires enable_fast_math=true");
+ }
+ const WinogradInfo winograd_info =
+ WinogradInfo(output_tile, kernel_size, input_dims, conv_info, src->data_layout());
+
+ _is_prepared = false;
+
+ // Configure input transform
+ _input_transform->configure(compile_context, src, &_input0, winograd_info);
+ _border_handler.configure(compile_context, src, _input_transform->border_size(), BorderMode::CONSTANT,
+ PixelValue());
+
+ // Configure filter transform
+ _filter_transform->configure(compile_context, weights, &_input1, winograd_info);
+
+ // Configure batched matrix multiply
+ _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f,
+ GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false,
+ GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16)));
+
+ // Configure output transform
+ _output_transform->set_target(CLScheduler::get().target());
+ _output_transform->configure(compile_context, &_batched_mm_output, biases, dst, winograd_info, act_info);
+
+ _aux_mem = _batched_mm.workspace();
+ const MemoryLifetime wino_wei_lifetm =
+ std::any_of(std::begin(_aux_mem), std::end(_aux_mem),
+ [](const auto &r) { return (r.lifetime == MemoryLifetime::Persistent) && (r.size > 0); })
+ ? MemoryLifetime::Prepare
+ : MemoryLifetime::Persistent;
+ _aux_mem.push_back(MemoryInfo(offset_int_vec(2), MemoryLifetime::Temporary, _input0.total_size()));
+ _aux_mem.push_back(MemoryInfo(offset_int_vec(3), wino_wei_lifetm, _input1.total_size()));
+ _aux_mem.push_back(MemoryInfo(offset_int_vec(4), MemoryLifetime::Temporary, _batched_mm_output.total_size()));
+}
+
+Status ClWinogradConv2d::validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math));
+ return Status{};
+}
+
+void ClWinogradConv2d::run(ITensorPack &tensors)
+{
+ const bool is_gemm_reshaped = _aux_mem[3].lifetime == MemoryLifetime::Prepare;
+
+ auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+ auto biases = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+ auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+ CLAuxTensorHandler input0(offset_int_vec(2), _input0, tensors, true);
+ CLAuxTensorHandler input1(offset_int_vec(3), _input1, tensors, true, is_gemm_reshaped);
+ CLAuxTensorHandler batched_mm_output(offset_int_vec(4), _batched_mm_output, tensors, true);
+
+ prepare(tensors);
+
+ // Run input transform
+ ITensorPack pack_it{
+ {TensorType::ACL_SRC, src},
+ {TensorType::ACL_DST, input0.get()},
+ };
+ CLScheduler::get().enqueue_op(_border_handler, pack_it, false);
+ CLScheduler::get().enqueue_op(*_input_transform, pack_it, false);
+
+ // Run batched matrix multiplication
+ ITensorPack pack_mm = tensors;
+ pack_mm.add_const_tensor(TensorType::ACL_SRC_0, input0.get());
+ pack_mm.add_tensor(TensorType::ACL_DST, batched_mm_output.get());
+ is_gemm_reshaped ? pack_mm.remove_tensor(TensorType::ACL_SRC_1)
+ : pack_mm.add_const_tensor(TensorType::ACL_SRC_1, input1.get());
+ _batched_mm.run(pack_mm);
+
+ // Run output transform
+ ITensorPack pack_ot{
+ {TensorType::ACL_SRC_0, batched_mm_output.get()},
+ {TensorType::ACL_SRC_1, biases},
+ {TensorType::ACL_DST, dst},
+ };
+ CLScheduler::get().enqueue_op(*_output_transform, pack_ot);
+}
+
+void ClWinogradConv2d::prepare(ITensorPack &tensors)
+{
+ if (!_is_prepared)
+ {
+ auto weights =
+ utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+ ICLTensor *in1_aux = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(3)));
+
+ CLAuxTensorHandler input1(_input1, *in1_aux);
+ ITensorPack pack_ft{
+ {TensorType::ACL_SRC, weights},
+ {TensorType::ACL_DST, input1.get()},
+ };
+ // Run filter transform and mark original weights as unused
+ CLScheduler::get().enqueue_op(*_filter_transform, pack_ft, false);
+ weights->mark_as_unused();
+
+ // Prepare GEMM and release reshaped weights if marked unused by ClGemm
+ ITensorPack mm_prepare_pack = tensors;
+ mm_prepare_pack.add_tensor(ACL_SRC_1, input1.get());
+ _batched_mm.prepare(mm_prepare_pack);
+
+ CLScheduler::get().queue().finish();
+ _is_prepared = true;
+ }
+}
+
+experimental::MemoryRequirements ClWinogradConv2d::workspace() const
+{
+ return _aux_mem;
+}
+} // namespace opencl
+} // namespace arm_compute
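The aux-memory bookkeeping above relies on the caller (typically a runtime-level convolution function) to materialise the buffers described by workspace() and to expose them to run() under the same slot ids, where the CLAuxTensorHandler objects pick them up. A minimal sketch of that contract, assuming the caller manages CLTensor lifetimes directly rather than going through the MemoryHelpers utilities; the function name run_winograd and the simplified U8 backing buffers (alignment ignored) are illustrative.

#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/gpu/cl/operators/ClWinogradConv2d.h"

#include <memory>
#include <vector>

using namespace arm_compute;

void run_winograd(opencl::ClWinogradConv2d &winograd_op, ICLTensor *src, ICLTensor *weights,
                  ICLTensor *biases, ICLTensor *dst)
{
    ITensorPack run_pack{{TensorType::ACL_SRC_0, src},
                         {TensorType::ACL_SRC_1, weights},
                         {TensorType::ACL_SRC_2, biases},
                         {TensorType::ACL_DST, dst}};

    // One backing CLTensor per workspace entry, registered under the operator's slot id.
    std::vector<std::unique_ptr<CLTensor>> aux_tensors;
    for (const auto &req : winograd_op.workspace())
    {
        if (req.size == 0)
        {
            continue;
        }
        auto tensor = std::make_unique<CLTensor>();
        tensor->allocator()->init(TensorInfo(TensorShape(req.size), 1, DataType::U8));
        tensor->allocator()->allocate();
        run_pack.add_tensor(req.slot, tensor.get());
        aux_tensors.emplace_back(std::move(tensor));
    }

    // run() prepares on first use (filter transform + GEMM prepare) and then executes
    // the input transform, the batched GEMM and the output transform.
    winograd_op.run(run_pack);
}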
diff --git a/src/gpu/cl/operators/ClWinogradConv2d.h b/src/gpu/cl/operators/ClWinogradConv2d.h
new file mode 100644
index 0000000000..54ec1a1737
--- /dev/null
+++ b/src/gpu/cl/operators/ClWinogradConv2d.h
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_WINOGRADCONV2D_H
+#define ARM_COMPUTE_CL_WINOGRADCONV2D_H
+
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+#include "src/gpu/cl/operators/ClGemm.h"
+
+namespace arm_compute
+{
+class CLCompileContext;
+class ITensorInfo;
+namespace opencl
+{
+namespace kernels
+{
+class ClWinogradInputTransformKernel;
+class ClWinogradFilterTransformKernel;
+class ClWinogradOutputTransformKernel;
+} // namespace kernels
+/** Basic function to execute Winograd-based convolution on OpenCL. This function calls the following OpenCL functions/kernels:
+ *
+ * -# @ref kernels::ClWinogradInputTransformKernel
+ * -# @ref kernels::ClWinogradFilterTransformKernel (only once)
+ * -# @ref ClGemm
+ * -# @ref kernels::ClWinogradOutputTransformKernel
+ *
+ */
+class ClWinogradConv2d : public IClOperator
+{
+public:
+ /** Default constructor */
+ ClWinogradConv2d();
+ /** Default destructor */
+ ~ClWinogradConv2d();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ ClWinogradConv2d(const ClWinogradConv2d &) = delete;
+ /** Default move constructor */
+ ClWinogradConv2d(ClWinogradConv2d &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ ClWinogradConv2d &operator=(const ClWinogradConv2d &) = delete;
+ /** Default move assignment operator */
+ ClWinogradConv2d &operator=(ClWinogradConv2d &&) = default;
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:--------------|:------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ *
+ * @note This function only works with 3x3, 3x1, 1x3, 5x5, 5x1, 1x5, 7x1 and 1x7 kernels, with unit strides, for both the NCHW and NHWC data layouts
+ * @note Some Winograd configurations (i.e. F(4x4, 5x5)) are supported only with enable_fast_math = true
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: F16/F32.
+ * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p src.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].Data type supported: Same as @p src
+ * @param[out] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p src.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ * @param[in] enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
+ * available, which can reduce accuracy. Default is false
+ */
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src,
+ ITensorInfo *weights,
+ ITensorInfo *biases,
+ ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to ClWinogradConv2d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src,
+ const ITensorInfo *weights,
+ const ITensorInfo *biases,
+ const ITensorInfo *dst,
+ const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ bool enable_fast_math = false);
+
+ // Inherited methods overridden
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ ClGemm _batched_mm;
+ std::unique_ptr<kernels::ClWinogradInputTransformKernel> _input_transform;
+ std::unique_ptr<kernels::ClWinogradFilterTransformKernel> _filter_transform;
+ std::unique_ptr<kernels::ClWinogradOutputTransformKernel> _output_transform;
+ CLFillBorderKernel _border_handler;
+ TensorInfo _input0;
+ TensorInfo _input1;
+ TensorInfo _batched_mm_output;
+ bool _is_prepared;
+ experimental::MemoryRequirements _aux_mem{};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_WINOGRADCONV2D_H */
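Finally, a hedged configuration sketch for the interface above (not part of the patch): a 3x3, stride-1, padding-1 convolution on a 56x56x64 NCHW F32 input, which maps to the F(4x4, 3x3) Winograd configuration. The concrete shapes, the fused ReLU, and the availability of a compile_context from the active backend are assumptions for illustration only.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/gpu/cl/operators/ClWinogradConv2d.h"

using namespace arm_compute;

void configure_winograd(const CLCompileContext &compile_context)
{
    TensorInfo src(TensorShape(56U, 56U, 64U, 1U), 1, DataType::F32);     // [W, H, IFM, N]
    TensorInfo weights(TensorShape(3U, 3U, 64U, 128U), 1, DataType::F32); // [kernel_x, kernel_y, IFM, OFM]
    TensorInfo biases(TensorShape(128U), 1, DataType::F32);               // [OFM]
    TensorInfo dst(TensorShape(56U, 56U, 128U, 1U), 1, DataType::F32);    // [W, H, OFM, N]
    src.set_data_layout(DataLayout::NCHW);
    weights.set_data_layout(DataLayout::NCHW);
    dst.set_data_layout(DataLayout::NCHW);

    const PadStrideInfo         conv_info(1, 1, 1, 1); // unit strides, half-kernel padding
    const ActivationLayerInfo   act_info(ActivationLayerInfo::ActivationFunction::RELU);

    if (bool(opencl::ClWinogradConv2d::validate(&src, &weights, &biases, &dst, conv_info, act_info,
                                                /* enable_fast_math */ false)))
    {
        opencl::ClWinogradConv2d winograd;
        winograd.configure(compile_context, &src, &weights, &biases, &dst, conv_info, act_info,
                           /* enable_fast_math */ false);
        // workspace() now describes the input/filter-transform buffers and the batched GEMM
        // output that the caller must provide through the tensor pack at run time.
    }
}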